Example #1
    def checkCrossSections(self):
        # checks if all cross sections (apart from data)
        # were specified

        import re
        import sys
        from flashgg.MetaData.samples_utils import SamplesManager
        from flashgg.MetaData.JobConfig import JobConfig

        # ignore any escaping etc.
        kwargs = {}
        for part in re.split(r'\s+', self.options.cmdLine):
            key, value = part.split('=',1)
            kwargs[key] = value
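        # e.g. cmdLine = "campaign=MyCampaign maxEvents=100" (hypothetical) yields
        # kwargs = {'campaign': 'MyCampaign', 'maxEvents': '100'}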

        if 'campaign' not in kwargs:
            print >> sys.stderr,"WARNING: campaign not set"

        # TODO: the following line duplicates logic from JobConfig(..), but we can't
        #       make JobConfig.options.parseArguments() parse anything other than
        #       sys.argv without temporarily modifying sys.argv...
        campaign = kwargs.get('campaign', "")

        jobConfig = JobConfig(**kwargs)

        sm = SamplesManager("$CMSSW_BASE/src/%s/MetaData/data/%s/datasets.json" % (jobConfig.metaDataSrc, campaign),
                            jobConfig.crossSections,
                            )

        hasProblem = False

        # loop over all types of processes (data, signal, background)
        for dsList in self.options.processes.values():
            for idsName in dsList:
                #if args were provided for this dataset, then it is a list...
                if isinstance(idsName, list):
                    dsName=idsName[0]
                #... or just a string
                else:
                    dsName=idsName

                # check if this dataset was selected
                if not self.isSelectedDataset(dsName):
                    # skip this dataset
                    continue

                #----------
                try:
                    name, xsec, totEvents, files, maxEvents = sm.getDatasetMetaData(jobConfig.maxEvents, dsName, jobId=-1, nJobs= jobConfig.nJobs)

                    if xsec is None:
                        print >> sys.stderr,"cross section for",dsName,"not found in",jobConfig.crossSections
                        hasProblem = True

                except Exception, ex:
                    if type(ex) == Exception and (
                        ex.message.startswith('No dataset matched the request:') or
                        # 'then' (sic) is kept as-is to match the original exception text
                        ex.message.startswith('More then one dataset matched the request:')):
                        print >> sys.stderr, ex.message
                    else:
                        raise
Example #2
    def parse(self):

        import sys  # for sys.exit below; typically also imported at module level

        if self.parsed:
            return

        self.options.parseArguments()
        #        print "parsing arguments, processIdnex is "+str(self.options.processIndex)
        self.processIndex = self.options.processIndex
        if self.options.processIdMap != "":
            self.readProcessIdMap(self.options.processIdMap)

        if self.useAAA:
            self.filePrepend = "root://xrootd-cms.infn.it/"
        elif self.useEOS:
            self.filePrepend = "root://eoscms//eos/cms"

        self.samplesMan = None
        dataset = None
        if self.dataset != "":
            print "Reading dataset (%s) %s" % (self.campaign, self.dataset)
            self.samplesMan = SamplesManager(
                "$CMSSW_BASE/src/%s/MetaData/data/%s/datasets.json" %
                (self.metaDataSrc, self.campaign),
                self.crossSections,
            )
            if self.dryRun and self.getMaxJobs:
                dataset = self.samplesMan.getDatasetMetaData(self.maxEvents,
                                                             self.dataset,
                                                             jobId=-1,
                                                             nJobs=self.nJobs)
            else:
                dataset = self.samplesMan.getDatasetMetaData(self.maxEvents,
                                                             self.dataset,
                                                             jobId=self.jobId,
                                                             nJobs=self.nJobs)
            if not dataset:
                print "Could not find dataset %s in campaing %s/%s" % (
                    self.dataset, self.metaDataSrc, self.campaing)
                sys.exit(-1)

        self.dataset = dataset
        # auto-detect data from xsec = 0
        if self.dataset:
            name, xsec, totEvents, files, maxEvents = self.dataset
            if type(xsec) != dict or type(xsec.get("xs", None)) != float:
                print "Warning: you are running on a dataset for which you specified no cross section: \n %s " % name
            else:
                if self.processType == "" and xsec["xs"] == 0.:
                    self.processType = "data"

            self.processId = self.getProcessId(name)

        outputFile = self.outputFile
        if self.jobId != -1:
            outputFile = "%s_%d.root" % (outputFile.replace(".root",
                                                            ""), self.jobId)
        self.setType("outputFile", VarParsing.VarParsing.varType.string)
        self.outputFile = outputFile
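        # e.g. outputFile "output.root" with jobId 7 becomes "output_7.root";
        # the default jobId of -1 leaves the name unchanged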

        self.parsed = True
Example #3
    def parse(self):

        if self.parsed:
            return

        self.options.parseArguments()
        if self.options.processIdMap != "":
            self.readProcessIdMap(self.options.processIdMap)

        if self.useAAA:
            self.filePrepend = "root://xrootd-cms.infn.it/"
        elif self.useEOS:
            self.filePrepend = "root://eoscms//eos/cms"

        dataset = None
        if self.dataset != "":
            print "JOBCONFIG", self.dataset
            print "Reading dataset (%s) %s" % (self.campaign, self.dataset)
            #MATTEO
            sm = SamplesManager(
                "$CMSSW_BASE/src/%s/MetaData/data/%s/datasets.json" %
                (self.metaDataSrc, self.campaign),
                self.crossSections,
            )
            if self.dryRun and self.getMaxJobs:
                dataset = sm.getDatasetMetaData(self.maxEvents,
                                                self.dataset,
                                                jobId=-1,
                                                nJobs=self.nJobs)
            else:
                dataset = sm.getDatasetMetaData(self.maxEvents,
                                                self.dataset,
                                                jobId=self.jobId,
                                                nJobs=self.nJobs)
        self.dataset = dataset

        outputFile = self.outputFile
        if self.jobId != -1:
            outputFile = "%s_%d.root" % (outputFile.replace(".root",
                                                            ""), self.jobId)
        self.setType("outputFile", VarParsing.VarParsing.varType.string)
        self.outputFile = outputFile
Example #4
    def parse(self):

        import sys  # for sys.exit below; typically also imported at module level

        if self.parsed:
            return

        self.options.parseArguments()
#        print "parsing arguments, processIdnex is "+str(self.options.processIndex)
        self.processIndex = self.options.processIndex
        if self.options.processIdMap != "":
            self.readProcessIdMap(self.options.processIdMap)
        
        if self.useAAA:
            self.filePrepend = "root://xrootd-cms.infn.it/"
        elif self.useEOS:
            if self.atIC:
                self.filePrepend = "root://eoscms.cern.ch//eos/cms"
            else:    
                self.filePrepend = "root://eoscms//eos/cms"
        
        self.samplesMan = None
        dataset = None
        if self.dataset != "":
            print "Reading dataset (%s) %s" % ( self.campaign, self.dataset)
            self.samplesMan = SamplesManager("$CMSSW_BASE/src/%s/MetaData/data/%s/datasets.json" % (self.metaDataSrc, self.campaign),
                                         self.crossSections,
                                         )
            if self.dryRun and self.getMaxJobs:
                dataset = self.samplesMan.getDatasetMetaData(self.maxEvents,self.dataset,jobId=-1,nJobs=self.nJobs)
            else:
                dataset = self.samplesMan.getDatasetMetaData(self.maxEvents,self.dataset,jobId=self.jobId,nJobs=self.nJobs)
            if not dataset: 
                print "Could not find dataset %s in campaing %s/%s" % (self.dataset,self.metaDataSrc,self.campaing)
                sys.exit(-1)
                
        self.dataset = dataset
        # auto-detect data from xsec = 0
        if self.dataset:
            name,xsec,totEvents,files,maxEvents = self.dataset            
            if type(xsec) != dict or type(xsec.get("xs",None)) != float:
                print "Warning: you are running on a dataset for which you specified no cross section: \n %s " % name
            else:
                if self.processType == "" and xsec["xs"] == 0.:
                    self.processType = "data"
                    
            self.processId = self.getProcessId(name)
            
        outputFile=self.outputFile
        if self.jobId != -1:
            outputFile = "%s_%d.root" % ( outputFile.replace(".root",""), self.jobId )
        self.setType ("outputFile", VarParsing.VarParsing.varType.string)
        self.outputFile = outputFile

        self.parsed=True
Example #5
class JobConfig(object):
    
    def __init__(self,*args,**kwargs):
        
        super(JobConfig,self).__init__()

        self.metaDataSrc=kwargs.get("metaDataSrc","flashgg")
        self.crossSections=kwargs.get("crossSections",["$CMSSW_BASE/src/flashgg/MetaData/data/cross_sections.json"])
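        # Assumed shape of a cross_sections.json entry, inferred from the
        # "xs"/"br"/"kf"/"itype" lookups in customize() below (all values illustrative):
        #   "SomeDataset": { "xs": 1.0, "br": 1.0, "kf": 1.0, "itype": -1 }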
        self.tfileOut=kwargs.get("tfileOut",None)

        self.processIdMap = {}

        self.options = VarParsing.VarParsing("analysis")
        ## self.options.setDefault ('maxEvents',100)
        self.options.register ('dataset',
                       "", # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.string,          # string, int, or float
                       "dataset")
        self.options.register ('processId',
                       "", # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.string,          # string, int, or float
                       "processId")
        self.options.register ('processIdMap',
                       "", # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.string,          # string, int, or float
                       "processIdMap")
        self.options.register ('processIndex',
                       None, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.int,          # string, int, or float
                       "processIndex")
        self.options.register ('secondaryDatasetInProcId',
                       False, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.bool,          # string, int, or float
                       "secondaryDatasetInProcId")
        self.options.register ('campaign',
                       "", # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.string,          # string, int, or float
                       "campaign")
        self.options.register ('useAAA',
                       False, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.bool,          # string, int, or float
                       "useAAA")
        self.options.register ('useEOS',
                       True, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.bool,          # string, int, or float
                       "useEOS")
        self.options.register ('targetLumi',
                       1.e+3, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.float,          # string, int, or float
                       "targetLumi")
        self.options.register ('nJobs',
                       0, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.int,          # string, int, or float
                       "nJobs")
        self.options.register ('jobId',
                               -1, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.int,          # string, int, or float
                               "jobId")
        self.options.register ('lastAttempt',
                       False, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.bool,          # string, int, or float
                       "lastAttempt")
        self.options.register ('lumiMask',
                       "", # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.string,          # string, int, or float
                       "lumiMask")
        self.options.register ('dryRun',
                       False, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.bool,          # string, int, or float
                       "dryRun")
        self.options.register ('dumpPython',
                       "", # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.string,          # string, int, or float
                       "dumpPython")
        self.options.register ('getMaxJobs',
                       False, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.bool,          # string, int, or float
                       "getMaxJobs")
        self.options.register ('processType',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "processType")
        self.options.register ('puTarget',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "puTarget")

        
        self.parsed = False
        
        
        from SimGeneral.MixingModule.mix_2015_25ns_Startup_PoissonOOTPU_cfi import mix as mix_2015_25ns
        from SimGeneral.MixingModule.mix_2015_50ns_Startup_PoissonOOTPU_cfi import mix as mix_2015_50ns
        self.pu_distribs = { "74X_mcRun2_asymptotic_v2" : mix_2015_25ns.input.nbPileupEvents }
        try:
            from SimGeneral.MixingModule.mix_2015_25ns_FallMC_matchData_PoissonOOTPU_cfi import mix as mix_2015_76_25ns
            self.pu_distribs["PU25nsData2015v1"] = mix_2015_76_25ns.input.nbPileupEvents
        except Exception:
            print "Failed to load 76X mixing, this is expected in 74X!"
            
        try:
            from SimGeneral.MixingModule.mix_2016_25ns_SpringMC_PUScenarioV1_PoissonOOTPU_cfi import mix as mix_2016_80_25ns
            self.pu_distribs["PUSpring16"] = mix_2016_80_25ns.input.nbPileupEvents
        except Exception:
            print "Failed to load 80X mixing, this is expected in 7X!"

        try:
            from SimGeneral.MixingModule.mix_2016_25ns_Moriond17MC_PoissonOOTPU_cfi import mix as mix_Moriond17
            self.pu_distribs["Summer16"] = mix_Moriond17.input.nbPileupEvents
            self.pu_distribs["PUMoriond17"] = mix_Moriond17.input.nbPileupEvents
            self.pu_distribs["upgrade2017"] = mix_Moriond17.input.nbPileupEvents
        except Exception:
            print "Failed to load Moriond17 mixing, this is expected in earlier releases"

            
    def __getattr__(self,name):
        ## we did not manage to inherit from VarParsing because of issues in its __init__;
        ## this delegation allows VarParsing attributes and methods to be used directly on JobConfig
        if hasattr(self.options,name):
            return getattr(self.options,name)
        
        raise AttributeError(name)
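
        ## e.g. jobConfig.maxEvents transparently resolves to jobConfig.options.maxEvents
        ## through this delegation (illustrative)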
    
    def __call__(self,process):
        self.customize(process)
 
    # process customization
    def customize(self,process):
        self.parse()

        isFwlite = False
        hasOutput = False
        hasTFile = False
        sp_unused = ""
        if hasattr(process,"fwliteInput"):
            isFwlite = True
        if not isFwlite:
            hasOutput = hasattr(process,"out")            
            hasTFile = hasattr(process,"TFileService")
        
        if hasOutput and hasTFile:
            tfile = self.outputFile.replace(".root","_histos.root")
        else:
            tfile = self.outputFile
            
        import sys  # used for sys.exit below (needed outside the dry-run branch too)

        if self.dryRun:
            if self.dataset and self.dataset != "":
                name,xsec,totEvents,files,maxEvents,sp_unused = self.dataset
                if self.getMaxJobs:
                    print "maxJobs:%d" % ( min(len(files),self.nJobs) )                    
                if len(files) != 0:
                    if isFwlite:
                        print "hadd:%s" % self.outputFile
                    else:
                        if hasOutput:
                            print "edm:%s" % self.outputFile
                        if hasTFile or self.tfileOut:
                            print "hadd:%s" % tfile
                    ## sys.exit(0)
            else:
                sys.exit(1)
            
        files = self.inputFiles
        if self.dataset and self.dataset != "":
            dsetname,xsec,totEvents,files,maxEvents,sp_unused = self.dataset
            if type(xsec) == float or xsec is None:
                print
                print "Error: cross section not found for dataset %s" % dsetname
                print
                
            self.maxEvents = int(maxEvents)
            
            putarget = None
            samplepu = None
            if self.puTarget != "":
                putarget = map(float, self.puTarget.split(","))
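                # e.g. puTarget="1.5,2.5,3.5" (hypothetical) gives putarget=[1.5, 2.5, 3.5]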
                
            processId = self.getProcessId(dsetname)
            self.processId = processId

            #----------

            if self.options.processIndex is not None:
                self.processIndex = self.options.processIndex
            else:
                # not specified on the command line, try to take it 
                # from the cross section file, otherwise use smallest int32 as default value
                # in order not to confuse it with data (index 0)

                if isinstance(xsec, dict):
                    self.processIndex = xsec.get('itype', -0x7FFFFFFF)
                else:
                    # note that in some cases (process not defined in cross_sections.json ?)
                    # this can still be a float
                    self.processIndex = -0x7FFFFFFF

            #----------

            if isinstance(xsec, dict) and "itype" in xsec:
                for name,obj in process.__dict__.iteritems():
                    if hasattr(obj, "sampleIndex"):
                        obj.sampleIndex = xsec["itype"]

            
            isdata = self.processType == "data"
            if isdata or self.targetLumi > 0. or putarget:
                ## look for analyzers which have lumiWeight as attribute
                for name,obj in process.__dict__.iteritems():
                    
                    if hasattr(obj,"lumiWeight"):
                        if  isdata:
                            obj.lumiWeight = 1.
                        else:
                            wei = xsec["xs"]/float(totEvents)*self.targetLumi
                            wei *= xsec.get("br",1.)
                            wei *= xsec.get("kf",1.)
                            obj.lumiWeight = wei
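                            # worked example (illustrative numbers): xs=50. pb, totEvents=1e6,
                            # targetLumi=1e3 /pb, br=kf=1. -> lumiWeight = 50./1e6*1e3 = 0.05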

                    if hasattr(obj,"intLumi"):
                        if isdata:
                            obj.intLumi= 0 # should not be used in final fits;
                            # setting it to 0 will cause an error if someone tries to use
                            # it for normalization downstream
                        else:
                            obj.intLumi=self.targetLumi

                    if putarget and not isdata:
                        puObj = None
                        if hasattr(obj,"puReWeight"):
                            puObj = obj
                        elif hasattr(obj,"globalVariables") and hasattr(obj.globalVariables,"puReWeight"):
                            puObj = obj.globalVariables
                        if puObj:
                            if not samplepu:
                                matches = filter(lambda x: x in dsetname, self.pu_distribs.keys() )
                                print matches
                                if len(matches) > 1:
                                    print "Multiple matches, check if they're all the same"
                                    allsame = True
                                    for i in range(1,len(matches)):
                                        if self.pu_distribs[matches[0]] != self.pu_distribs[matches[i]]:
                                            allsame = False
                                    if allsame:
                                        print "They're all the same so we just take the 0th one:",matches[0]
                                        matches = [matches[0]]
                                    else:
                                        print "Not all the same... so we return to the old behavior and take an exact match, otherwise leave empty..."
                                        matches = filter(lambda x: x == dsetname, matches)
                                if len(matches) != 1:
                                    raise Exception("Could not determine sample pu distribution for reweighting. Possible matches are [%s]. Selected [%s]\n dataset: %s" % 
                                                ( ",".join(self.pu_distribs.keys()), ",".join(matches), dsetname ) )
                                samplepu = self.pu_distribs[matches[0]]
                            puObj.puReWeight = True
                            puObj.puBins = cms.vdouble( map(float, samplepu.probFunctionVariable) )
                            puObj.mcPu   = samplepu.probValue
                            puObj.dataPu = cms.vdouble(putarget)
                            puObj.useTruePu = cms.bool(True)
                        
                    
            for name,obj in process.__dict__.iteritems():
                if hasattr(obj,"processId"):
                    obj.processId = str(processId)

            for name,obj in process.__dict__.iteritems():
                if hasattr(obj,"processIndex"):
                    obj.processIndex = int(self.processIndex)
                    
            lumisToSkip = None
            if isdata:
                lumisToSkip = self.samplesMan.getLumisToSkip(dsetname)
                process.source.lumisToSkip = lumisToSkip.getVLuminosityBlockRange()

            if isdata and self.lumiMask != "":
                if isFwlite:
                    sys.exit("Lumi mask not supported in FWlite")

                import FWCore.PythonUtilities.LumiList as LumiList
                target = LumiList.LumiList(filename = self.lumiMask)
                if lumisToSkip:
                    target = target - lumisToSkip
                process.source.lumisToProcess = target.getVLuminosityBlockRange()

            if isdata:    
                print process.source.lumisToProcess
            
        flist = []
        for f in files:
            if len(f.split(":",1))>1:
                flist.append(str(f))
            else:
                flist.append(str("%s%s" % (self.filePrepend,f)))
        if len(flist) > 0:
            ## fwlite
            if isFwlite:
                ## process.fwliteInput.fileNames.extend([ str("%s%s" % (self.filePrepend,f)) for f in  files])
                process.fwliteInput.fileNames = flist
            ## full framework
            else:
                ## process.source.fileNames.extend([ str("%s%s" % (self.filePrepend,f)) for f in  files])
                process.source.fileNames = flist
 
        ## fwlite
        if isFwlite:
            process.fwliteInput.maxEvents = self.maxEvents
            process.fwliteOutput.fileName = self.outputFile
        ## full framework
        else:
            process.maxEvents.input = self.maxEvents
            
            if hasOutput:
                process.out.fileName = self.outputFile

            if hasTFile:
                process.TFileService.fileName = tfile
    
        if self.tfileOut:
            if hasTFile:
                print "Could not run with both TFileService and custom tfileOut"
                sys.exit(-1)
            name,attr = self.tfileOut
            setattr( getattr( process, name ), attr, tfile )
            

        if self.dumpPython != "":
            import gzip
            # avoid shadowing the builtin open(); write mode is all that is needed here
            pyout = gzip.open("%s.gz" % self.dumpPython, "w")
            pyout.write( process.dumpPython() )
            pyout.close()

    # parse command line and do post-processing
    def parse(self):

        import sys  # for sys.exit below; typically also imported at module level

        if self.parsed:
            return

        self.options.parseArguments()
#        print "parsing arguments, processIdnex is "+str(self.options.processIndex)
        self.processIndex = self.options.processIndex
        if self.options.processIdMap != "":
            self.readProcessIdMap(self.options.processIdMap)
        
        if self.useAAA:
            self.filePrepend = "root://xrootd-cms.infn.it/"
        elif self.useEOS:
            self.filePrepend = "root://eoscms.cern.ch//eos/cms"
        
        self.samplesMan = None
        dataset = None
        if self.dataset != "":
            print "Reading dataset (%s) %s" % ( self.campaign, self.dataset)
            self.samplesMan = SamplesManager("$CMSSW_BASE/src/%s/MetaData/data/%s/datasets*.json" % (self.metaDataSrc, self.campaign),
                                         self.crossSections,
                                         )
            if self.dryRun and self.getMaxJobs:
                dataset = self.samplesMan.getDatasetMetaData(self.maxEvents,self.dataset,jobId=-1,nJobs=self.nJobs)
            else:
                dataset = self.samplesMan.getDatasetMetaData(self.maxEvents,self.dataset,jobId=self.jobId,nJobs=self.nJobs)
            if not dataset: 
                print "Could not find dataset %s in campaing %s/%s" % (self.dataset,self.metaDataSrc,self.campaing)
                sys.exit(-1)
                
        self.dataset = dataset
        # auto-detect data from xsec = 0
        if self.dataset:
            name,xsec,totEvents,files,maxEvents,specialPrepend = self.dataset
            if len(specialPrepend) > 0 and not self.useAAA:
                self.filePrepend = specialPrepend
            if type(xsec) != dict or type(xsec.get("xs",None)) != float:
                print "Warning: you are running on a dataset for which you specified no cross section: \n %s " % name
            else:
                if self.processType == "" and xsec["xs"] == 0.:
                    self.processType = "data"
                    
            self.processId = self.getProcessId(name)
            
        outputFile=self.outputFile
        if self.jobId != -1:
            outputFile = "%s_%d.root" % ( outputFile.replace(".root",""), self.jobId )
        self.setType ("outputFile", VarParsing.VarParsing.varType.string)
        self.outputFile = outputFile

        self.parsed=True

    def datasetName(self):
        if type(self.dataset) == tuple:
            return self.dataset[0]
        return self.dataset
    
    def getProcessId(self,name):
        return self.getProcessId_(name).replace("/","").replace("-","_")
    
    def getProcessId_(self,name):
        if self.processId != "":
            return self.processId
        
        ## print name, self.processIdMap
        if name in self.processIdMap:
            return self.processIdMap[name]

        primSet,secSet = name.rsplit("/")[1:3]
        if primSet in self.processIdMap:
            return self.processIdMap[primSet]

        primSet = "/"+primSet
        if primSet in self.processIdMap:
            return self.processIdMap[primSet]
        
        if self.secondaryDatasetInProcId:
            return primSet + "_" + secSet
        return primSet
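
        ## e.g. (hypothetical) name="/DoubleEG/Run2015D-v1/MINIAOD" gives primSet="DoubleEG",
        ## secSet="Run2015D-v1"; with secondaryDatasetInProcId this returns "/DoubleEG_Run2015D-v1",
        ## which getProcessId() above cleans to "DoubleEG_Run2015D_v1"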

    def readProcessIdMap(self,fname):
        
        with open(fname) as fin:
            import json

            cfg = json.loads(fin.read())
            
            processes = cfg["processes"]
            for key,val in processes.iteritems():
                for dst in val:
                    if type(dst) == list:
                        name = dst[0]
                    else:
                        name = dst
                    self.processIdMap[name] = key
            
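        # Assumed processIdMap JSON shape, matching the parser above (names illustrative):
        # {
        #   "processes": {
        #     "data":    [ "/DoubleEG" ],
        #     "somebkg": [ [ "/SomeDataset", "extraArg" ] ]
        #   }
        # }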
Example #6
    def checkCrossSections(self):
        # checks if all cross sections (apart from data)
        # were specified

        import re
        import sys
        from flashgg.MetaData.samples_utils import SamplesManager
        from flashgg.MetaData.JobConfig import JobConfig

        # ignore any escaping etc.
        kwargs = {}
        for part in re.split(r'\s+', self.options.cmdLine):
            key, value = part.split('=', 1)
            kwargs[key] = value

        if 'campaign' not in kwargs:
            print >> sys.stderr, "WARNING: campaign not set"

        # automatically get metadata locations right for both high-mass and SM Higgs
        try:
            from diphotons.MetaData.JobConfig import customize
        except ImportError:
            from flashgg.MetaData.JobConfig import customize
        if "metaDataSrc" not in kwargs:
            kwargs["metaDataSrc"] = customize.metaDataSrc
        if "crossSections" not in kwargs:
            kwargs["crossSections"] = customize.crossSections

        # TODO: the following line duplicates logic from JobConfig(..), but we can't
        #       make JobConfig.options.parseArguments() parse anything other than
        #       sys.argv without temporarily modifying sys.argv...
        campaign = kwargs.get('campaign', "")

        jobConfig = JobConfig(**kwargs)

        sm = SamplesManager(
            "$CMSSW_BASE/src/%s/MetaData/data/%s/datasets.json" %
            (jobConfig.metaDataSrc, campaign),
            jobConfig.crossSections,
        )

        hasProblem = False

        # loop over all types of processes (data, signal, background)
        for dsList in self.options.processes.values():
            for idsName in dsList:
                #if args were provided for this dataset, then it is a list...
                if isinstance(idsName, list):
                    dsName = idsName[0]
                #... or just a string
                else:
                    dsName = idsName

                # check if this dataset was selected
                if not self.isSelectedDataset(dsName):
                    # skip this dataset
                    continue

                #----------
                try:
                    name, xsec, totEvents, files, maxEvents = sm.getDatasetMetaData(
                        jobConfig.maxEvents,
                        dsName,
                        jobId=-1,
                        nJobs=jobConfig.nJobs)

                    if xsec is None:
                        print >> sys.stderr, "cross section for", dsName, "not found in", jobConfig.crossSections
                        hasProblem = True

                except Exception, ex:
                    if type(ex) == Exception and (
                            ex.message.startswith(
                                'No dataset matched the request:')
                            or ex.message.startswith(
                                'More then one dataset matched the request:')):
                        print >> sys.stderr, ex.message
                    else:
                        raise
Example #7
class JobConfig(object):
    
    def __init__(self,*args,**kwargs):
        
        super(JobConfig,self).__init__()

        self.metaDataSrc=kwargs.get("metaDataSrc","flashgg")
        self.crossSections=kwargs.get("crossSections",["$CMSSW_BASE/src/flashgg/MetaData/data/cross_sections.json"])
        self.tfileOut=kwargs.get("tfileOut",None)

        self.processIdMap = {}

        self.options = VarParsing.VarParsing("analysis")
        ## self.options.setDefault ('maxEvents',100)
        self.options.register ('dataset',
                       "", # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.string,          # string, int, or float
                       "dataset")
        self.options.register ('processId',
                       "", # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.string,          # string, int, or float
                       "processId")
        self.options.register ('processIdMap',
                       "", # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.string,          # string, int, or float
                       "processIdMap")
        self.options.register ('processIndex',
                       None, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.int,          # string, int, or float
                       "processIndex")
        self.options.register ('secondaryDatasetInProcId',
                       False, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.bool,          # string, int, or float
                       "secondaryDatasetInProcId")
        self.options.register ('campaign',
                       "", # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.string,          # string, int, or float
                       "campaign")
        self.options.register ('useAAA',
                       False, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.bool,          # string, int, or float
                       "useAAA")
        self.options.register ('useEOS',
                       True, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.bool,          # string, int, or float
                       "useEOS")
        self.options.register ('targetLumi',
                       1.e+3, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.float,          # string, int, or float
                       "targetLumi")
        self.options.register ('nJobs',
                       0, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.int,          # string, int, or float
                       "nJobs")
        self.options.register ('jobId',
                               -1, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.int,          # string, int, or float
                               "jobId")
        self.options.register ('lastAttempt',
                       False, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.bool,          # string, int, or float
                       "lastAttempt")
        self.options.register ('lumiMask',
                       "", # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.string,          # string, int, or float
                       "lumiMask")
        self.options.register ('dryRun',
                       False, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.bool,          # string, int, or float
                       "dryRun")
        self.options.register ('dumpPython',
                       "", # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.string,          # string, int, or float
                       "dumpPython")
        self.options.register ('getMaxJobs',
                       False, # default value
                       VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                       VarParsing.VarParsing.varType.bool,          # string, int, or float
                       "getMaxJobs")
        self.options.register ('processType',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "processType")
        self.options.register ('puTarget',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "puTarget")

        
        self.parsed = False
        
        
        from SimGeneral.MixingModule.mix_2015_25ns_Startup_PoissonOOTPU_cfi import mix as mix_2015_25ns
        from SimGeneral.MixingModule.mix_2015_50ns_Startup_PoissonOOTPU_cfi import mix as mix_2015_50ns
        self.pu_distribs = { "74X_mcRun2_asymptotic_v2" : mix_2015_25ns.input.nbPileupEvents }
        try:
            from SimGeneral.MixingModule.mix_2015_25ns_FallMC_matchData_PoissonOOTPU_cfi import mix as mix_2015_76_25ns
            self.pu_distribs["PU25nsData2015v1"] = mix_2015_76_25ns.input.nbPileupEvents
        except Exception:
            print "Failed to load 76X mixing, this is expected in 74X!"
            
        try:
            from SimGeneral.MixingModule.mix_2016_25ns_SpringMC_PUScenarioV1_PoissonOOTPU_cfi import mix as mix_2016_80_25ns
            self.pu_distribs["PUSpring16"] = mix_2016_80_25ns.input.nbPileupEvents
        except Exception:
            print "Failed to load 80X mixing, this is expected in 7X!"

        try:
            from SimGeneral.MixingModule.mix_2016_25ns_Moriond17MC_PoissonOOTPU_cfi import mix as mix_Moriond17
            self.pu_distribs["Summer16"] = mix_Moriond17.input.nbPileupEvents
            self.pu_distribs["PUMoriond17"] = mix_Moriond17.input.nbPileupEvents
        except Exception:
            print "Failed to load Moriond17 mixing, this is expected in earlier releases"
            
    def __getattr__(self,name):
        ## we did not manage to inherit from VarParsing because of issues in its __init__;
        ## this delegation allows VarParsing attributes and methods to be used directly on JobConfig
        if hasattr(self.options,name):
            return getattr(self.options,name)
        
        raise AttributeError(name)
    
    def __call__(self,process):
        self.customize(process)
 
    # process customization
    def customize(self,process):
        self.parse()

        isFwlite = False
        hasOutput = False
        hasTFile = False
        if hasattr(process,"fwliteInput"):
            isFwlite = True
        if not isFwlite:
            hasOutput = hasattr(process,"out")            
            hasTFile = hasattr(process,"TFileService")
        
        if hasOutput and hasTFile:
            tfile = self.outputFile.replace(".root","_histos.root")
        else:
            tfile = self.outputFile
            
        import sys  # used for sys.exit below (needed outside the dry-run branch too)

        if self.dryRun:
            if self.dataset and self.dataset != "":
                name,xsec,totEvents,files,maxEvents = self.dataset
                if self.getMaxJobs:
                    print "maxJobs:%d" % ( min(len(files),self.nJobs) )                    
                if len(files) != 0:
                    if isFwlite:
                        print "hadd:%s" % self.outputFile
                    else:
                        if hasOutput:
                            print "edm:%s" % self.outputFile
                        if hasTFile or self.tfileOut:
                            print "hadd:%s" % tfile
                    ## sys.exit(0)
            else:
                sys.exit(1)
            

        files = self.inputFiles
        if self.dataset and self.dataset != "":
            dsetname,xsec,totEvents,files,maxEvents = self.dataset
            if type(xsec) == float:
                print 
                print "Error: cross section not found for dataset %s" % dsetname
                print
                
            self.maxEvents = int(maxEvents)
            
            putarget = None
            samplepu = None
            if self.puTarget != "":
                putarget = map(float, self.puTarget.split(","))
                
            processId = self.getProcessId(dsetname)
            self.processId = processId

            #----------

            if self.options.processIndex is not None:
                self.processIndex = self.options.processIndex
            else:
                # not specified on the command line, try to take it 
                # from the cross section file, otherwise use smallest int32 as default value
                # in order not to confuse it with data (index 0)

                if isinstance(xsec, dict):
                    self.processIndex = xsec.get('itype', -0x7FFFFFFF)
                else:
                    # note that in some cases (process not defined in cross_sections.json ?)
                    # this can still be a float
                    self.processIndex = -0x7FFFFFFF

            #----------

            if isinstance(xsec, dict) and "itype" in xsec:
                for name,obj in process.__dict__.iteritems():
                    if hasattr(obj, "sampleIndex"):
                        obj.sampleIndex = xsec["itype"]

            
            isdata = self.processType == "data"
            if isdata or self.targetLumi > 0. or putarget:
                ## look for analyzers which have lumiWeight as attribute
                for name,obj in process.__dict__.iteritems():
                    
                    if hasattr(obj,"lumiWeight"):
                        if  isdata:
                            obj.lumiWeight = 1.
                        else:
                            wei = xsec["xs"]/float(totEvents)*self.targetLumi
                            wei *= xsec.get("br",1.)
                            wei *= xsec.get("kf",1.)
                            obj.lumiWeight = wei

                    if hasattr(obj,"intLumi"):
                        if isdata:
                            obj.intLumi= 0 # should not be used in final fits;
                            # setting it to 0 will cause an error if someone tries to use
                            # it for normalization downstream
                        else:
                            obj.intLumi=self.targetLumi

                    if putarget and not isdata:
                        puObj = None
                        if hasattr(obj,"puReWeight"):
                            puObj = obj
                        elif hasattr(obj,"globalVariables") and hasattr(obj.globalVariables,"puReWeight"):
                            puObj = obj.globalVariables
                        if puObj:
                            if not samplepu:
                                matches = filter(lambda x: x in dsetname, self.pu_distribs.keys() )
                                print matches
                                if len(matches) > 1:
                                    print "Multiple matches, check if they're all the same"
                                    allsame = True
                                    for i in range(1,len(matches)):
                                        if self.pu_distribs[matches[0]] != self.pu_distribs[matches[i]]:
                                            allsame = False
                                    if allsame:
                                        print "They're all the same so we just take the 0th one:",matches[0]
                                        matches = [matches[0]]
                                    else:
                                        print "Not all the same... so we return to the old behavior and take an exact match, otherwise leave empty..."
                                        matches = filter(lambda x: x == dsetname, matches)
                                if len(matches) != 1:
                                    raise Exception("Could not determine sample pu distribution for reweighting. Possible matches are [%s]. Selected [%s]\n dataset: %s" % 
                                                ( ",".join(self.pu_distribs.keys()), ",".join(matches), dsetname ) )
                                samplepu = self.pu_distribs[matches[0]]
                            puObj.puReWeight = True
                            puObj.puBins = cms.vdouble( map(float, samplepu.probFunctionVariable) )
                            puObj.mcPu   = samplepu.probValue
                            puObj.dataPu = cms.vdouble(putarget)
                            puObj.useTruePu = cms.bool(True)
                        
                    
            for name,obj in process.__dict__.iteritems():
                if hasattr(obj,"processId"):
                    obj.processId = str(processId)

            for name,obj in process.__dict__.iteritems():
                if hasattr(obj,"processIndex"):
                    obj.processIndex = int(self.processIndex)
                    
            lumisToSkip = None
            if isdata:
                lumisToSkip = self.samplesMan.getLumisToSkip(dsetname)
                process.source.lumisToSkip = lumisToSkip.getVLuminosityBlockRange()

            if isdata and self.lumiMask != "":
                if isFwlite:
                    sys.exit("Lumi mask not supported in FWlite")

                import FWCore.PythonUtilities.LumiList as LumiList
                target = LumiList.LumiList(filename = self.lumiMask)
                if lumisToSkip:
                    target = target - lumisToSkip
                process.source.lumisToProcess = target.getVLuminosityBlockRange()

            if isdata:    
                print process.source.lumisToProcess
            
        flist = []
        for f in files:
            if len(f.split(":",1))>1:
                flist.append(str(f))
            else:
                flist.append(str("%s%s" % (self.filePrepend,f)))
        if len(flist) > 0:
            ## fwlite
            if isFwlite:
                ## process.fwliteInput.fileNames.extend([ str("%s%s" % (self.filePrepend,f)) for f in  files])
                process.fwliteInput.fileNames = flist
            ## full framework
            else:
                ## process.source.fileNames.extend([ str("%s%s" % (self.filePrepend,f)) for f in  files])
                process.source.fileNames = flist
 
        ## fwlite
        if isFwlite:
            process.fwliteInput.maxEvents = self.maxEvents
            process.fwliteOutput.fileName = self.outputFile
        ## full framework
        else:
            process.maxEvents.input = self.maxEvents
            
            if hasOutput:
                process.out.fileName = self.outputFile

            if hasTFile:
                process.TFileService.fileName = tfile
    
        if self.tfileOut:
            if hasTFile:
                print "Could not run with both TFileService and custom tfileOut"
                sys.exit(-1)
            name,attr = self.tfileOut
            setattr( getattr( process, name ), attr, tfile )
            

        if self.dumpPython != "":
            import gzip
            # avoid shadowing the builtin open(); write mode is all that is needed here
            pyout = gzip.open("%s.gz" % self.dumpPython, "w")
            pyout.write( process.dumpPython() )
            pyout.close()

    # parse command line and do post-processing
    def parse(self):

        import sys  # for sys.exit below; typically also imported at module level

        if self.parsed:
            return

        self.options.parseArguments()
#        print "parsing arguments, processIdnex is "+str(self.options.processIndex)
        self.processIndex = self.options.processIndex
        if self.options.processIdMap != "":
            self.readProcessIdMap(self.options.processIdMap)
        
        if self.useAAA:
            self.filePrepend = "root://xrootd-cms.infn.it/"
        elif self.useEOS:
            self.filePrepend = "root://eoscms.cern.ch//eos/cms"
        
        self.samplesMan = None
        dataset = None
        if self.dataset != "":
            print "Reading dataset (%s) %s" % ( self.campaign, self.dataset)
            self.samplesMan = SamplesManager("$CMSSW_BASE/src/%s/MetaData/data/%s/datasets.json" % (self.metaDataSrc, self.campaign),
                                         self.crossSections,
                                         )
            if self.dryRun and self.getMaxJobs:
                dataset = self.samplesMan.getDatasetMetaData(self.maxEvents,self.dataset,jobId=-1,nJobs=self.nJobs)
            else:
                dataset = self.samplesMan.getDatasetMetaData(self.maxEvents,self.dataset,jobId=self.jobId,nJobs=self.nJobs)
            if not dataset: 
                print "Could not find dataset %s in campaing %s/%s" % (self.dataset,self.metaDataSrc,self.campaing)
                sys.exit(-1)
                
        self.dataset = dataset
        # auto-detect data from xsec = 0
        if self.dataset:
            name,xsec,totEvents,files,maxEvents = self.dataset            
            if type(xsec) != dict or type(xsec.get("xs",None)) != float:
                print "Warning: you are running on a dataset for which you specified no cross section: \n %s " % name
            else:
                if self.processType == "" and xsec["xs"] == 0.:
                    self.processType = "data"
                    
            self.processId = self.getProcessId(name)
            
        outputFile=self.outputFile
        if self.jobId != -1:
            outputFile = "%s_%d.root" % ( outputFile.replace(".root",""), self.jobId )
        self.setType ("outputFile", VarParsing.VarParsing.varType.string)
        self.outputFile = outputFile

        self.parsed=True

    def datasetName(self):
        if type(self.dataset) == tuple:
            return self.dataset[0]
        return self.dataset
    
    def getProcessId(self,name):
        return self.getProcessId_(name).replace("/","").replace("-","_")
    
    def getProcessId_(self,name):
        if self.processId != "":
            return self.processId
        
        ## print name, self.processIdMap
        if name in self.processIdMap:
            return self.processIdMap[name]

        primSet,secSet = name.rsplit("/")[1:3]
        if primSet in self.processIdMap:
            return self.processIdMap[primSet]

        primSet = "/"+primSet
        if primSet in self.processIdMap:
            return self.processIdMap[primSet]
        
        if self.secondaryDatasetInProcId:
            return primSet + "_" + secSet
        return primSet

    def readProcessIdMap(self,fname):
        
        with open(fname) as fin:
            import json

            cfg = json.loads(fin.read())
            
            processes = cfg["processes"]
            for key,val in processes.iteritems():
                for dst in val:
                    if type(dst) == list:
                        name = dst[0]
                    else:
                        name = dst
                    self.processIdMap[name] = key
            
Example #8
class JobConfig(object):
    
    def __init__(self,*args,**kwargs):
        
        super(JobConfig,self).__init__()

        self.crossSections=kwargs.get("crossSections",["$CMSSW_BASE/src/flashgg/MetaData/data/cross_sections.json"])
        self.tfileOut=kwargs.get("tfileOut",None)

        self.processIdMap = {}

        self.options = VarParsing.VarParsing("analysis")
        self.options.register ('metaDataSrc',
                               'flashgg', # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "metaDataSrc")
        self.options.register ('metaConditions',
                                "", # default value
                               VarParsing.VarParsing.multiplicity.list, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "metaConditions")        
        self.options.register ('dataset',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "dataset")
        self.options.register ('processId',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "processId")
        self.options.register ('processIdMap',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "processIdMap")
        self.options.register ('processIndex',
                               None, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.int,          # string, int, or float
                               "processIndex")
        self.options.register ('secondaryDatasetInProcId',
                               False, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.bool,          # string, int, or float
                               "secondaryDatasetInProcId")
        self.options.register ('campaign',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "campaign")
        self.options.register ('useAAA',
                               True, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.bool,          # string, int, or float
                               "useAAA")
        self.options.register ('useEOS',
                               False, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.bool,          # string, int, or float
                               "useEOS")
        self.options.register ('copyInputMicroAOD',
                               False, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.bool,          # string, int, or float
                               "copyInputMicroAOD")
        self.options.register ('useParentDataset',
                               False, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.bool,          # string, int, or float
                               "useParentDataset")
        self.options.register ('recalculatePDFWeights',
                               False, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.bool,          # string, int, or float
                               "recalculatePDFWeights")
        self.options.register ('secondaryDataset',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,         # string, int, or float
                               "secondaryDataset")
        self.options.register ('targetLumi',
                               1.e+3, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.float,          # string, int, or float
                               "targetLumi")
        self.options.register ('nJobs',
                               0, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.int,          # string, int, or float
                               "nJobs")
        self.options.register ('jobId',
                               -1, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.int,          # string, int, or float
                               "jobId")
        self.options.register ('lastAttempt',
                               False, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.bool,          # string, int, or float
                               "lastAttempt")
        self.options.register ('lumiMask',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "lumiMask")
        self.options.register ('dryRun',
                               False, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.bool,          # string, int, or float
                               "dryRun")
        self.options.register ('dumpPython',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "dumpPython")
        self.options.register ('getMaxJobs',
                               False, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.bool,          # string, int, or float
                               "getMaxJobs")
        self.options.register ('processType',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "processType")
        self.options.register ('puTarget',
                               "", # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "puTarget")
        self.options.register ('WeightName', # LHE weight name; must be given e.g. for THQ/THW samples
                               None, # default value
                               VarParsing.VarParsing.multiplicity.singleton, # singleton or list
                               VarParsing.VarParsing.varType.string,          # string, int, or float
                               "WeightName")
        
        self.parsed = False        
        
        from SimGeneral.MixingModule.mix_2015_25ns_Startup_PoissonOOTPU_cfi import mix as mix_2015_25ns
        from SimGeneral.MixingModule.mix_2015_50ns_Startup_PoissonOOTPU_cfi import mix as mix_2015_50ns
        self.pu_distribs = { "74X_mcRun2_asymptotic_v2" : mix_2015_25ns.input.nbPileupEvents }
        try:
            from SimGeneral.MixingModule.mix_2015_25ns_FallMC_matchData_PoissonOOTPU_cfi import mix as mix_2015_76_25ns
            self.pu_distribs["PU25nsData2015v1"] = mix_2015_76_25ns.input.nbPileupEvents
        except Exception:
            print "Failed to load 76X mixing, this is expected in 74X!"
            
        try:
            from SimGeneral.MixingModule.mix_2016_25ns_SpringMC_PUScenarioV1_PoissonOOTPU_cfi import mix as mix_2016_80_25ns
            self.pu_distribs["PUSpring16"] = mix_2016_80_25ns.input.nbPileupEvents
        except Exception:
            print "Failed to load 80X mixing, this is expected in 7X!"

        try:
            from SimGeneral.MixingModule.mix_2016_25ns_Moriond17MC_PoissonOOTPU_cfi import mix as mix_Moriond17
            self.pu_distribs["Summer16"] = mix_Moriond17.input.nbPileupEvents
            self.pu_distribs["PUMoriond17"] = mix_Moriond17.input.nbPileupEvents
            self.pu_distribs["upgrade2017"] = mix_Moriond17.input.nbPileupEvents
        except Exception:
            print "Failed to load Moriond17 mixing, this is expected in earlier releases"
            
        try:
            from flashgg.MetaData.mix_2017MCv2_DYJetsToLL import mix as mix_94X_mc2017
            #from flashgg.MetaData.mix_2017MCv2_GJet_Combined import mix as mix_94X_mc2017
            self.pu_distribs["94X_mc2017"] = mix_94X_mc2017.input.nbPileupEvents
        except Exception:
            print "Failed to load 94X_mc2017 mixing"
        try:
            from SimGeneral.MixingModule.mix_2018_25ns_JuneProjectionFull18_PoissonOOTPU_cfi import mix as mix_Autumn18
            self.pu_distribs["Autumn18"] = mix_Autumn18.input.nbPileupEvents
        except Exception:
            print "Failed to load Autumn18 mixing"
            
        #self.pu_distribs_hack_2017 = {  }

        # try:
        #     import importlib
        #     from os import listdir,environ
        #     mixdir = "PU_MixFiles_2017_miniaodv2_310"
        #     thedir = "%s/src/flashgg/MetaData/python/%s" % (environ['CMSSW_BASE'],mixdir)
        #     print thedir
        #     for fn in listdir(thedir):
        #         print fn
        #         if fn.startswith("mix_2017MC_") and fn.endswith(".py"):
        #             mn = fn[:-3]
        #             print fn,mn
        #             m = importlib.import_module("flashgg.MetaData.%s.%s" % (mixdir,mn))
        #             kn = mn.replace("mix_2017MC_","")
        #             self.pu_distribs_hack_2017[kn] = m.mix.input.nbPileupEvents
        # except Exception,e:
        #     print "failed to load hacky 94X mixing by dataset"
        #     raise e
            
    def __getattr__(self,name):
        ## could not inherit from VarParsing because of issues in its __init__;
        ## forwarding attribute lookups lets VarParsing options be read
        ## directly off JobConfig instances
        if hasattr(self.options,name):
            return getattr(self.options,name)
        
        raise AttributeError(name)
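
    # Example (hypothetical): after jc = JobConfig(); jc.parse(), reading
    # jc.maxEvents is forwarded to jc.options.maxEvents by __getattr__ above.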
    
    def __call__(self,process):
        self.customize(process)
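
    # This lets a configuration file end with a plain "customize(process)"
    # call, customize being a JobConfig instance.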
 
    # process customization
    def customize(self,process):
        self.parse()

        # keep useParent and secondaryDataset as exclusive options for the moment
        if self.options.useParentDataset and self.options.secondaryDataset != "":
            raise Exception("useParentDataset cannot be set together with a secondaryDataset")

        isFwlite = False
        hasOutput = False
        hasTFile = False
        sp_unused = ""
        if hasattr(process,"fwliteInput"):
            isFwlite = True
        if not isFwlite:
            hasOutput = hasattr(process,"out")            
            hasTFile = hasattr(process,"TFileService")
        
        if hasOutput and hasTFile:
            tfile = self.outputFile.replace(".root","_histos.root")
        else:
            tfile = self.outputFile
            
        if self.dryRun:
            import sys
            if self.dataset and self.dataset != "":
                name,xsec,totEvents,files,maxEvents,sp_unused = self.dataset
                if self.getMaxJobs:
                    print "maxJobs:%d" % ( min(len(files),self.nJobs) )                    
                if len(files) != 0:
                    if isFwlite:
                        print "hadd:%s" % self.outputFile
                    else:
                        if hasOutput:
                            print "edm:%s" % self.outputFile
                        if hasTFile or self.tfileOut:
                            print "hadd:%s" % tfile
                    ## sys.exit(0)
            else:
                sys.exit(1)
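
        # the "maxJobs:", "hadd:" and "edm:" lines above form a small text
        # protocol, presumably parsed by the job-submission wrapper to learn
        # the number of jobs and which output files to merge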
        
                
        files = self.inputFiles
        if self.dataset and self.dataset != "":
            dsetname,xsec,totEvents,files,maxEvents,sp_unused = self.dataset
            # a bare float or None means no proper cross section entry (a dict
            # with at least an "xs" field) was found for this dataset
            if isinstance(xsec, float) or xsec is None:
                print 
                print "Error: cross section not found for dataset %s" % dsetname
                print
                
            putarget = None
            samplepu = None
            if self.puTarget != "":
                # comma-separated target (data) pileup distribution, one value per bin
                putarget = map(float, self.puTarget.split(","))
                
            processId = self.getProcessId(dsetname)
            self.processId = processId

            #----------

            if self.options.processIndex is not None:
                self.processIndex = self.options.processIndex
            else:
                # not specified on the command line: try to take it from the
                # cross section file, otherwise default to the smallest int32
                # so that it cannot be confused with data (index 0)

                if isinstance(xsec, dict):
                    self.processIndex = xsec.get('itype', -0x7FFFFFFF)
                else:
                    # note that in some cases (process not defined in cross_sections.json ?)
                    # this can still be a float
                    self.processIndex = -0x7FFFFFFF

            #----------

            if isinstance(xsec, dict) and "itype" in xsec:
                for name,obj in process.__dict__.iteritems():
                    if hasattr(obj, "sampleIndex"):
                        obj.sampleIndex = xsec["itype"]

            
            isdata = self.processType == "data"
            if isdata or self.targetLumi > 0. or putarget:
                ## look for analyzers which have lumiWeight as attribute
                for name,obj in process.__dict__.iteritems():
                    
                    if hasattr(obj,"lumiWeight"):
                        if  isdata:
                            obj.lumiWeight = 1.
                        else:
                            wei = xsec["xs"]/float(totEvents)*self.targetLumi
                            wei *= xsec.get("br",1.)
                            wei *= xsec.get("kf",1.)
                            obj.lumiWeight = wei

                    if hasattr(obj,"intLumi"):
                        if isdata:
                            obj.intLumi= 0 # should not be used in final fits.
                            # setting to 0 will cause error if someone tries to use
                            #it for normalization downsteram
                        else:
                            obj.intLumi=self.targetLumi

                    if putarget and not isdata:
                        puObj = None
                        if hasattr(obj,"puReWeight"):
                            puObj = obj
                        elif hasattr(obj,"globalVariables") and hasattr(obj.globalVariables,"puReWeight"):
                            puObj = obj.globalVariables
                        if puObj:
                            if not samplepu:
#                                print dsetname
#                                print self.pu_distribs.keys()
#                                hack2017 = True
#                                found_hack2017 = False
#                                if hack2017:
                                # if self.options.PUyear=="2017":
                                #     print dsetname.split("/")[1]
                                #    # print self.pu_distribs.keys()
                                #     print self.pu_distribs_hack_2017.keys()
                                #    # matches = filter(lambda x: x == dsetname.split("/")[1],self.pu_distribs.keys())
                                #     matches = filter(lambda x: x == dsetname.split("/")[1],self.pu_distribs_hack_2017.keys())
                                #     if len(matches) == 1:
                                #         found_hack2017 = True
                                #         print "FOUND HACK2017 PILEUP DISTRIBUTION WITH KEY:",matches[0]
                                # if not found_hack2017:
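                                # pick the pileup scenario whose key appears as a
                                # substring of the dataset name (e.g. "PUMoriond17"
                                # inside a RunIISummer16...PUMoriond17... name)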
                                matches = filter(lambda x: x in dsetname, self.pu_distribs.keys() )
                                print matches
                                if len(matches) > 1:
                                    print "Multiple matches, check if they're all the same"
                                    allsame = True
                                    for i in range(1,len(matches)):
                                        if self.pu_distribs[matches[0]] != self.pu_distribs[matches[i]]:
                                            allsame = False
                                    if allsame:
                                        print "They're all the same so we just take the 0th one:",matches[0]
                                        matches = [matches[0]]
                                    else:
                                        print "Not all the same... so we return to the old behavior and take an exact match, otherwise leave empty..."
                                        matches = filter(lambda x: x == dsetname, matches)
                                if len(matches) != 1:
                                    raise Exception("Could not determine sample pu distribution for reweighting. Possible matches are [%s]. Selected [%s]\n dataset: %s" % 
                                                        ( ",".join(self.pu_distribs.keys()), ",".join(matches), dsetname ) )
                                # if self.options.PUyear=="2017": samplepu = self.pu_distribs_hack_2017[matches[0]]
                                # else :
                            samplepu = self.pu_distribs[matches[0]]
                            puObj.puReWeight = True
                            puObj.puBins = cms.vdouble( map(float, samplepu.probFunctionVariable) )
                            puObj.mcPu   = samplepu.probValue
                            puObj.dataPu = cms.vdouble(putarget)
                            puObj.useTruePu = cms.bool(True)
                    
            for name,obj in process.__dict__.iteritems():
                if hasattr(obj,"processId"):
                    obj.processId = str(processId)

            for name,obj in process.__dict__.iteritems():
                if hasattr(obj,"processIndex"):
                    obj.processIndex = int(self.processIndex)
                    
            lumisToSkip = None
            if isdata:
                # skip lumi sections flagged as invalid in the sample catalog
                lumisToSkip = self.samplesMan.getLumisToSkip(dsetname)
                process.source.lumisToSkip = lumisToSkip.getVLuminosityBlockRange()

            if isdata and self.lumiMask != "":
                if isFwlite:
                    sys.exit("Lumi mask not supported in FWlite",-1)

                import FWCore.PythonUtilities.LumiList as LumiList
                target = LumiList.LumiList(filename = self.lumiMask)
                # drop the lumi sections flagged as bad for this dataset from
                # the certified list before applying it to the source
                if lumisToSkip: 
                    target = target - lumisToSkip
                process.source.lumisToProcess = target.getVLuminosityBlockRange()

                print process.source.lumisToProcess

        flist = []
        sflist = []

        # get the runs and lumis contained in each file of the secondary dataset
        if self.options.secondaryDataset:
            secondary_files = [fdata['file'][0]['name'] for fdata in das_query("file dataset=%s instance=prod/phys03" % self.options.secondaryDataset, 
                                                                               cmd='dasgoclient --dasmaps=./')['data']]
            runs_and_lumis = {}
            for s in secondary_files:
                runs_and_lumis[str(s)] = {data['lumi'][0]['run_number'] : data['lumi'][0]['lumi_section_num']
                                          for data in das_query("lumi file=%s instance=prod/phys03" % s, cmd='dasgoclient --dasmaps=./')['data']}
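            # runs_and_lumis: secondary file -> { run number : [lumi sections] };
            # used below to attach to each primary file the secondary files that
            # share at least one (run, lumi) with it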

        for f in files:
            # files that already carry a protocol prefix (e.g. "root://", "file:")
            # are used as-is, all others get the configured prepend
            if len(f.split(":",1))>1:
                flist.append(str(f))
            else:
                flist.append(str("%s%s" % (self.filePrepend,f)))
            # keep useParent and secondaryDataset as exclusive options for the moment
            if self.options.useParentDataset:
                parent_files = das_query("parent file=%s instance=prod/phys03" % f, cmd='dasgoclient --dasmaps=./')['data']
                for parent_f in parent_files:
                    parent_f_name = str(parent_f['parent'][0]['name'])
                    sflist.append('root://cms-xrd-global.cern.ch/'+parent_f_name if 'root://' not in parent_f_name else parent_f_name)
            elif self.options.secondaryDataset != "":
                # match primary file to the corresponding secondary file(s)
                f_runs_and_lumis = {data['lumi'][0]['run_number'] : data['lumi'][0]['lumi_section_num']
                                    for data in das_query("lumi file=%s instance=prod/phys03" % f, cmd='dasgoclient --dasmaps=./')['data']}
                for s_name, s_runs_and_lumis in runs_and_lumis.items():
                    matched_runs = set(f_runs_and_lumis.keys()).intersection(s_runs_and_lumis.keys())
                    for run in matched_runs:
                        if any(lumi in f_runs_and_lumis[run] for lumi in s_runs_and_lumis[run]):
                            sflist.append(s_name)

        ## mitigate server glitches by copying the input files (microAOD) on the worker node
        if self.copyInputMicroAOD and not self.dryRun:
            commands.getstatusoutput('mkdir -p input_files/')
            for i,f in enumerate(flist):
                # copy each input file locally with xrdcp and point the source
                # at the local copy
                status, out = commands.getstatusoutput('xrdcp %s ./input_files/'%f)
                print out
                flocal = 'file:./input_files/'+f.split('/')[-1]
                flist[i] = flocal

        if len(flist) > 0:
            ## fwlite
            if isFwlite:
                ## process.fwliteInput.fileNames.extend([ str("%s%s" % (self.filePrepend,f)) for f in  files])
                process.fwliteInput.fileNames = flist
            ## full framework
            else:
                ## process.source.fileNames.extend([ str("%s%s" % (self.filePrepend,f)) for f in  files])
                process.source.fileNames = flist
                if len(sflist) > 0:
                    process.source.secondaryFileNames = cms.untracked.vstring(sflist)
 
        ## fwlite
        if isFwlite:
            process.fwliteInput.maxEvents = self.options.maxEvents
            process.fwliteOutput.fileName = self.outputFile
        ## full framework
        else:
            process.maxEvents.input = self.options.maxEvents
            
            if hasOutput:
                process.out.fileName = self.outputFile

            if hasTFile:
                process.TFileService.fileName = tfile
    
        if self.tfileOut:
            if hasTFile:
                print "Cannot run with both TFileService and a custom tfileOut"
                sys.exit(-1)
            # tfileOut is a (module name, attribute name) pair pointing at the
            # output file parameter of the module that writes the TFile
            name,attr = self.tfileOut
            setattr( getattr( process, name ), attr, tfile )
            

        if self.dumpPython != "":
            from gzip import open
            pyout = open("%s.gz" % self.dumpPython,"w+")
            pyout.write( process.dumpPython() )
            pyout.close()

    # parse command line and do post-processing
    def parse(self):

        if self.parsed:
            return

        self.options.parseArguments()
#        print "parsing arguments, processIdnex is "+str(self.options.processIndex)
        self.processIndex = self.options.processIndex
        if self.options.processIdMap != "":
            self.readProcessIdMap(self.options.processIdMap)

        if self.useAAA:
       #     self.filePrepend = "root://xrootd-cms.infn.it/"
            self.filePrepend = "root://cms-xrd-global.cern.ch/"
        elif self.useEOS:
            self.filePrepend = "root://eoscms.cern.ch//eos/cms"

        self.samplesMan = None
        dataset = None
        if self.dataset != "":
            #print "Reading dataset (%s) %s" % ( self.campaign, self.dataset)
            self.samplesMan = SamplesManager("$CMSSW_BASE/src/%s/MetaData/data/%s/datasets*.json" % (self.metaDataSrc, self.campaign),
                                         self.crossSections,
                                         )
            if self.dryRun and self.getMaxJobs:
                dataset = self.samplesMan.getDatasetMetaData(self.options.maxEvents,self.dataset,jobId=-1,nJobs=self.nJobs,weightName=self.WeightName)
            else:
                dataset = self.samplesMan.getDatasetMetaData(self.options.maxEvents,self.dataset,jobId=self.jobId,nJobs=self.nJobs,weightName=self.WeightName)

            if not dataset: 
                print "Could not find dataset %s in campaign %s/%s" % (self.dataset,self.metaDataSrc,self.campaign)
                sys.exit(-1)
        
        self.dataset = dataset
        # auto-detect data from xsec = 0
        if self.dataset:
            name,xsec,totEvents,files,maxEvents,specialPrepend = self.dataset
            if len(specialPrepend) > 0 and not self.useAAA:
                self.filePrepend = specialPrepend
            if type(xsec) != dict or type(xsec.get("xs",None)) != float:
                print "Warning: you are running on a dataset for which you specified no cross section: \n %s " % name
            else:
                if self.processType == "" and xsec["xs"] == 0.:
                    self.processType = "data"
                    
            self.processId = self.getProcessId(name)

        outputFile=self.outputFile
        if self.jobId != -1:
            outputFile = "%s_%d.root" % ( outputFile.replace(".root",""), self.jobId )
        self.setType ("outputFile", VarParsing.VarParsing.varType.string)
        self.outputFile = outputFile

        self.parsed=True

    def datasetName(self):
        if type(self.dataset) == tuple:
            return self.dataset[0]
        return self.dataset
    
    def getProcessId(self,name):
        return self.getProcessId_(name).replace("/","").replace("-","_")
    
    def getProcessId_(self,name):
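        # Fallback chain: explicit processId option; processIdMap lookup by full
        # dataset name, by primary dataset name, then by "/<primary dataset>";
        # finally the primary dataset name itself (combined with the secondary
        # dataset name if secondaryDatasetInProcId is set). For a hypothetical
        # "/GluGluHToGG_M125/RunIIFall17-v1/MINIAODSIM", primSet is
        # "GluGluHToGG_M125" and secSet is "RunIIFall17-v1".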
        if self.processId != "":
            return self.processId
        
        ## print name, self.processIdMap
        if name in self.processIdMap:
            return self.processIdMap[name]

        primSet,secSet = name.rsplit("/")[1:3]
        if primSet in self.processIdMap:
            return self.processIdMap[primSet]

        primSet = "/"+primSet
        if primSet in self.processIdMap:
            return self.processIdMap[primSet]
        
        if self.secondaryDatasetInProcId:
            return primSet + "_" + secSet
        return primSet

    def readProcessIdMap(self,fname):
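        # The map file is JSON; a hypothetical minimal example:
        #
        #   { "processes" : { "smh"  : [ "/GluGluHToGG_M125_13TeV_powheg_pythia8" ],
        #                     "data" : [ [ "/DoubleEG", "someArg" ] ] } }
        #
        # keys become process ids, values list the datasets mapped to them.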
        
        import json

        # the file is closed automatically when the with block exits
        with open(fname) as fin:
            cfg = json.loads(fin.read())

        processes = cfg["processes"]
        for key,val in processes.iteritems():
            for dst in val:
                # an entry is either a bare dataset name or a list whose
                # first element is the name (the rest are per-dataset args)
                if isinstance(dst, list):
                    name = dst[0]
                else:
                    name = dst
                self.processIdMap[name] = key