Example #1
    def __init__(self, group, name, *paths, **kwargs):
        """Container class for CMSSW samples, e.g.:
       - group: DY (used to group similar samples in final output)
       - name:  DYJetsToLL_M-50 (used as shorthand and jobname)
       - path:  /DYJetsToLL_M-50_TuneCP5_13TeV-madgraphMLM-pythia8/RunIIAutumn18NanoAODv6_Nano25Oct2019_102X_mcRun2/NANOAODSIM
       - dtype: 'mc', 'data', 'embed'
    """

        # PATH
        LOG.insist(
            len(paths) >= 1, "Need at least one path to create a sample...")
        if len(paths) == 1 and isinstance(paths[0], list):
            paths = paths[0]
        for path in paths:
            LOG.insist(
                path.count('/') >= 3 and path.startswith('/'),
                "DAS path %r has wrong format. Need /SAMPLE/CAMPAIGN/FORMAT." %
                (path))
            #sample = '/'.join(line.split('/')[-3:])

        # DATA TYPE
        dtype = kwargs.get('dtype', None)
        dtypes = ['mc', 'data', 'embed']
        if dtype is None:  # automatic recognition from the DAS path
            path = paths[0]
            if 'Embed' in path:
                dtype = 'embed'
            elif path.endswith('SIM') or any(g in path
                                             for g in ['pythia', 'madgraph']):
                dtype = 'mc'
            elif re.search(r"/Run20\d\d", path):
                dtype = 'data'
        LOG.insist(
            dtype in dtypes,
            "Given data type '%s' is not recognized! Please choose from %s..."
            % (dtype, ', '.join(dtypes)))

        # ATTRIBUTES
        self.group = group
        self.name = name
        self.paths = paths  # DAS dataset paths
        self.dtype = dtype
        self.channels = kwargs.get('channel', None)
        self.channels = kwargs.get('channels', self.channels)
        self.storage = None
        self.storepath = kwargs.get('store', None)  # if stored elsewhere than DAS
        self.url = kwargs.get('url', None)  # URL if stored elsewhere
        self.dasurl = kwargs.get('dasurl', None) or "root://cms-xrd-global.cern.ch/"  # URL for DAS
        self.blacklist = kwargs.get('blacklist', [])  # list of blacklisted files
        self.instance = kwargs.get('instance', 'prod/phys03' if path.endswith('USER') else 'prod/global')  # if None, does not exist in DAS
        self.nfilesperjob = kwargs.get('nfilesperjob', -1)  # number of nanoAOD files per job
        self.maxevts = kwargs.get('maxevtsperjob', -1)  # maximum number of events processed per job
        self.maxevts = kwargs.get('maxevts', self.maxevts)  # alternative key for the same setting
        self.extraopts = kwargs.get('opts', [])  # extra options for analysis module, e.g. ['doZpt=1','tes=1.1']
        self.subtry = kwargs.get('subtry', 0)  # to help keep track of resubmission
        self.jobcfg = kwargs.get('jobcfg', {})  # to help keep track of resubmission
        self.nevents = kwargs.get('nevts', 0)  # number of nanoAOD events that can be processed
        self.nevents = kwargs.get('nevents', self.nevents)  # cache of number of events
        self.files = kwargs.get('files', [])  # list of ROOT files, OR text file with list of files
        self.filenevts = {}  # cache of number of events for each file
        self.pathfiles = {}  # cache of file list per DAS dataset path (filled in loadfiles)
        self.postfix = kwargs.get('postfix', None) or ""  # post-fix (before '.root') for stored ROOT files
        self.era = kwargs.get('era', "")  # for expansion of $ERA variable
        self.dosplit = kwargs.get('split', len(self.paths) >= 2)  # allow splitting (if multiple DAS datasets)
        self.verbosity = kwargs.get('verbosity', 0)  # verbosity level for debugging
        self.refreshable = not self.files  # allow refresh of file list in getfiles()

        # ENSURE LIST
        if self.channels is not None and not isinstance(self.channels, list):
            self.channels = [self.channels]
        if isinstance(self.extraopts, str):
            if ',' in self.extraopts:
                self.extraopts = self.extraopts.split(',')
            else:
                self.extraopts = [self.extraopts]

        # STORAGE & URL DEFAULTS
        if self.storepath:
            self.storepath = repkey(self.storepath,
                                    USER=_user,
                                    ERA=self.era,
                                    GROUP=self.group,
                                    SAMPLE=self.name)
            self.storage = getstorage(repkey(self.storepath,
                                             PATH=self.paths[0],
                                             DAS=self.paths[0]),
                                      ensure=False)
        if not self.dasurl:
            self.dasurl = self.url if (self.url in dasurls) else dasurls[0]
        if not self.url:
            if self.storepath:
                if self.storage.__class__.__name__ == 'Local':
                    self.url = ""  #root://cms-xrd-global.cern.ch/
                else:
                    self.url = self.storage.fileurl
            else:
                self.url = self.dasurl

        # GET FILE LIST FROM TEXT FILE
        if isinstance(self.files, str):
            self.loadfiles(self.files)
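
A minimal usage sketch for the constructor in Example #1. It assumes the surrounding class is named Sample and is importable from this module; the keyword values and the storage pattern below are illustrative, and only the DAS path is copied from the docstring above.

# Hypothetical usage of the constructor above (class name Sample assumed).
sample = Sample(
    'DY', 'DYJetsToLL_M-50',
    "/DYJetsToLL_M-50_TuneCP5_13TeV-madgraphMLM-pythia8/RunIIAutumn18NanoAODv6_Nano25Oct2019_102X_mcRun2/NANOAODSIM",
    dtype='mc', era='2018',
    store="/eos/user/$USER/$ERA/$GROUP/$SAMPLE",  # hypothetical pattern; $-variables are expanded via repkey
    opts="doZpt=1,tes=1.1",  # a comma-separated string is split into ['doZpt=1','tes=1.1']
)
print(sample.dtype, sample.dosplit, len(sample.paths))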
Example #2
 def loadfiles(self,listname_,**kwargs):
   """Load filenames from text file for fast look-up in the future."""
   verbosity = LOG.getverbosity(self,kwargs)
   listname  = repkey(listname_,ERA=self.era,GROUP=self.group,SAMPLE=self.name)
   LOG.verb("loadfiles: listname=%r -> %r, len(files)=%d, len(filenevts)=%d"%(
     listname_,listname,len(self.files),len(self.filenevts)),verbosity,1)
   filenevts = self.filenevts
   nevents   = 0
   #listname = ensurefile(listname,fatal=False)
   filelist = [ ]
   paths = self.paths if '$PATH' in listname else [self.paths[0]]
   for path in paths:
     listname_ = repkey(listname,PATH=path.strip('/').replace('/','__'))
     if self.verbosity>=1:
       print ">>> Loading sample files from %r..."%(listname_)
     self.pathfiles[path] = [ ]
     if os.path.isfile(listname_):
       skip = False
       subpaths = [ ] # for sanity check
       with open(listname_,'r') as file:
         for line in file:
           line = line.strip().split() # split at space to allow comments at end
           if not line: continue
           line = line[0].strip() # remove spaces, consider only first part of the line
           if line[0]=='#': continue # do not consider comments
           #if line.endswith('.root'):
           if line.startswith("DASPATH="): # to keep track of multiple DAS data set paths
             path = line.split('=')[-1] # DAS data set path
             LOG.insist(path.count('/')>=3 and path.startswith('/'),
               "DAS path %r in %s has wrong format. Need /SAMPLE/CAMPAIGN/FORMAT..."%(path,listname_))
             if path in self.paths: # store file list for this path
               self.pathfiles[path] = [ ]
               subpaths.append(path)
               skip = False
             else: # do not store file list for this path
               skip = True
           else:
             if skip: continue # only load files for this sample's DAS dataset paths
              match = fevtsexp.match(line) # match $FILENAME(:NEVTS)
              if not match: continue
              infile = match.group(1)
              nevts  = -1 # number of events unknown unless given in the list
              if match.group(2): # found nevents in filename
                nevts  = int(match.group(2))
                filenevts[infile] = nevts # store/cache in dictionary
                nevents += nevts
              filelist.append(infile)
              self.pathfiles[path].append(infile)
              if self.verbosity>=3:
                print(">>> %7d events for %s"%(nevts,infile))
       if not filelist:
         LOG.warning("loadfiles: Did not find any files in %s!"%(listname_))
         self.refreshable = True
        else: # sanity check for empty sub-lists per DAS path
         for subpath in subpaths:
           if not self.pathfiles[subpath]:
             LOG.warning("loadfiles: Did not find any files for path %s in %s!"%(subpath,listname_))
     else:
       LOG.warning("loadfiles: file list %s does not exist!"%(listname_))
       self.refreshable = True
   for path in self.paths:
     if path not in self.pathfiles: # nonexistent list
       LOG.warning("loadfiles: Did not find any files for path %s in %s!"%(path,listname))
   if self.nevents<=0:
     self.nevents = nevents
   elif self.nevents!=nevents:
     LOG.warning("loadfiles: stored nevents=%d does not match the sum total of file events, %d!"%(self.nevents,nevents))
      self.nevents = nevents # update cache to match the file list
   self.files = filelist
   self.files.sort()
   return self.files
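
For reference, a sketch of the plain-text list file that loadfiles parses in Example #2, assuming fevtsexp matches a file name optionally followed by ':<number of events>'; the list-file name and all file paths below are hypothetical placeholders.

# files_2018_DY_DYJetsToLL_M-50.txt -- hypothetical list file
DASPATH=/DYJetsToLL_M-50_TuneCP5_13TeV-madgraphMLM-pythia8/RunIIAutumn18NanoAODv6_Nano25Oct2019_102X_mcRun2/NANOAODSIM
/store/mc/RunIIAutumn18NanoAODv6/DYJetsToLL_M-50/NANOAODSIM/dummy_file_1.root:24800  # 24800 events, cached in filenevts
/store/mc/RunIIAutumn18NanoAODv6/DYJetsToLL_M-50/NANOAODSIM/dummy_file_2.root        # nevts omitted, so not cached

Such a list would be loaded with a call along these lines (the list-file path and its $-variables are assumptions; the same expansion also happens when a text file is passed as 'files' to the constructor):

files = sample.loadfiles("filelist/files_$ERA_$GROUP_$SAMPLE.txt")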