def hasLHESource(self):
    """
    Returns a tuple containing a bool to indicate usage of an LHESource
    and an integer for the number of input files.
    """
    if bootstrapDone():
        self.logger.debug("Getting LHE info from bootstrap cachefile.")
        info = self.getCfgInfo()
        return info['lheinfo']

    isLHE, numFiles = False, 0
    # Use a default of None so that a missing 'source' attribute does not raise.
    if getattr(self.fullConfig.process, 'source', None):
        source = self.fullConfig.process.source
        try:
            isLHE = str(source.type_()) == 'LHESource'
        except AttributeError as ex:
            msg = "Invalid CMSSW configuration: Failed to check if 'process.source' is of type 'LHESource': %s" % (ex)
            raise ConfigurationException(msg)
        if isLHE:
            if hasattr(source, 'fileNames'):
                numFiles = len(source.fileNames)
            else:
                msg = "Invalid CMSSW configuration: Object 'process.source', of type 'LHESource', is missing attribute 'fileNames'."
                raise ConfigurationException(msg)

    return isLHE, numFiles
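# --- Example (illustrative, not part of the original client code) ---
# The LHESource check above relies on just two duck-typed members of the
# source object: type_() and fileNames. A minimal sketch with a hypothetical
# FakeSource stand-in shows the detection logic without needing CMSSW:

class FakeSource(object):
    """Hypothetical stand-in mimicking the two attributes the check uses."""
    def __init__(self, typeName, fileNames):
        self._typeName = typeName
        self.fileNames = fileNames

    def type_(self):
        return self._typeName

source = FakeSource('LHESource', ['file:events1.lhe', 'file:events2.lhe'])
isLHE = str(source.type_()) == 'LHESource'
numFiles = len(source.fileNames) if isLHE else 0
print(isLHE, numFiles)  # True 2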
def outputFiles(self):
    """
    Returns a tuple of lists of output files. First element is PoolOutput
    files, second is TFileService files.
    """
    if bootstrapDone():
        self.logger.debug("Getting output files from bootstrap cachefile.")
        info = self.getCfgInfo()
        return info['outfiles']

    edmfiles = []
    process = self.fullConfig.process

    ## Determine all paths/endpaths which will run.
    pathsToRun = set()
    if process.schedule is not None:
        for p in process.schedule:
            pathsToRun.add(p.label())

    ## Determine all modules on EndPaths.
    modulesOnEndPaths = set()
    for m in process.endpaths_().itervalues():
        if len(pathsToRun) == 0 or m.label() in pathsToRun:
            for n in m.moduleNames():
                modulesOnEndPaths.add(n)

    outputModules = set()
    for n, o in process.outputModules_().iteritems():
        if n in modulesOnEndPaths and hasattr(o, 'fileName'):
            edmfile = re.sub(r'^file:', '', o.fileName.value())
            edmfile = os.path.basename(edmfile)
            edmfiles.append(edmfile)
            outputModules.add(o)

    ## If there are multiple output modules, make sure they have dataset.filterName set.
    if len(outputModules) > 1:
        for outputModule in outputModules:
            try:
                dataset = getattr(outputModule, 'dataset')
                filterName = getattr(dataset, 'filterName')
            except AttributeError:
                raise RuntimeError('Your output module %s does not have a "dataset" PSet ' % outputModule.label() +
                                   'or the PSet does not have a "filterName" member.')

    ## Find files written by TFileService.
    tfiles = []
    if 'TFileService' in process.services:
        tFileService = process.services['TFileService']
        if "fileName" in tFileService.parameterNames_():
            tfile = re.sub(r'^file:', '', getattr(tFileService, 'fileName').value())
            tfile = os.path.basename(tfile)
            tfiles.append(tfile)

    return edmfiles, tfiles
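# --- Example (illustrative, not part of the original client code) ---
# Both the PoolOutputModule and TFileService branches above normalize the
# configured file name the same way: strip an optional 'file:' prefix, then
# keep only the basename. The helper name stripFilePrefix is hypothetical:

import os
import re

def stripFilePrefix(fileName):
    """Normalize a CMSSW-style output file name to its bare basename."""
    return os.path.basename(re.sub(r'^file:', '', fileName))

print(stripFilePrefix('file:/tmp/out/histos.root'))  # histos.root
print(stripFilePrefix('output.root'))                # output.root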
def __init__(self, logger=None):
    self.logger = logger if logger else logging
    if bootstrapDone():
        self.logger.debug("Loading required information from the bootstrap environment file")
        try:
            self.initFromFile()
        except EnvironmentException as ee:
            self.logger.info(str(ee))
            self.logger.info("Will try to find the necessary information from the environment")
            self.initFromEnv()
    else:
        self.logger.debug("Loading required information from the environment")
        self.initFromEnv()
    self.logger.debug("Found %s for %s with base %s" %
                      (self.getCmsswVersion(), self.getScramArch(), self.getCmsswBase()))
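# --- Example (illustrative, not part of the original client code) ---
# When the bootstrap file is unavailable, the SCRAM information comes from the
# environment. A minimal sketch of such a lookup, assuming the conventional
# CMSSW_VERSION, SCRAM_ARCH and CMSSW_BASE variables set by 'cmsenv'; the
# initFromEnvSketch body is an assumption, not the original implementation:

import os

def initFromEnvSketch():
    """Hypothetical sketch: read the SCRAM setup from the process environment."""
    values = {}
    for key in ('CMSSW_VERSION', 'SCRAM_ARCH', 'CMSSW_BASE'):
        value = os.environ.get(key)
        if not value:
            raise EnvironmentException("%s is not set; run 'cmsenv' first." % key)
        values[key] = value
    return values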
def __init__(self, config, userConfig=None, logger=None):
    global configurationCache
    self.config = config
    self.logger = logger if logger else logging

    self.fullConfig = None
    self.outputFile = None

    if userConfig:
        cfgBaseName = os.path.basename(userConfig).replace(".py", "")
        cfgDirName = os.path.dirname(os.path.abspath(userConfig))

        if not os.path.isfile(userConfig):
            msg = "Cannot find CMSSW configuration file %s in %s" % (userConfig, os.getcwd())
            raise ConfigurationException(msg)

        self.logger.info("Importing CMSSW configuration %s" % (userConfig))
        pyCfgParams = getattr(self.config.JobType, 'pyCfgParams', [])
        originalArgv = sys.argv
        sys.argv = [userConfig]
        if pyCfgParams:
            sys.argv.extend(pyCfgParams)
            msg = "Additional parameters for the CMSSW configuration are: %s" % (pyCfgParams)
            self.logger.debug(msg)

        configFile, pathname, description = imp.find_module(cfgBaseName, [cfgDirName])
        cacheLine = (tuple(sys.path), tuple(pathname), tuple(sys.argv))
        if cacheLine in configurationCache:
            self.fullConfig = configurationCache[cacheLine]
            configFile.close()
        elif not bootstrapDone():
            sys.path.append(os.getcwd())
            try:
                oldstdout = sys.stdout
                sys.stdout = open(logger.logfile, 'a')
                self.fullConfig = imp.load_module(cfgBaseName, configFile, pathname, description)
            finally:
                sys.stdout.close()
                sys.stdout = oldstdout
                configFile.close()
            configurationCache[cacheLine] = self.fullConfig

        self.logger.info("Finished importing CMSSW configuration %s" % (userConfig))
        sys.argv = originalArgv
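# --- Example (illustrative, not part of the original client code) ---
# The constructor above memoizes the imported pset on everything that can
# influence the import: the interpreter search path, the resolved module path
# and the argv the pset sees. The same pattern in isolation (loadCached and
# _configCache are hypothetical names):

import sys

_configCache = {}

def loadCached(pathname, loader):
    """Memoize loader(pathname) on the state that can change its result."""
    cacheKey = (tuple(sys.path), pathname, tuple(sys.argv))
    if cacheKey not in _configCache:
        _configCache[cacheKey] = loader(pathname)
    return _configCache[cacheKey]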
def hasPoolSource(self):
    """
    Returns whether a PoolSource is present in the parameter set.
    """
    if bootstrapDone():
        self.logger.debug("Getting source info from bootstrap cachefile.")
        info = self.getCfgInfo()
        return info['poolinfo']

    isPool = False
    # Use a default of None so that a missing 'source' attribute does not raise.
    if getattr(self.fullConfig.process, 'source', None):
        source = self.fullConfig.process.source
        try:
            isPool = str(source.type_()) == 'PoolSource'
        except AttributeError as ex:
            msg = "Invalid CMSSW configuration: Failed to check if 'process.source' is of type 'PoolSource': %s" % (ex)
            raise ConfigurationException(msg)

    return isPool
def run(self, filecacheurl=None):
    """
    Override run() for JobType.
    """
    configArguments = {'addoutputfiles': [],
                       'adduserfiles': [],
                       'tfileoutfiles': [],
                       'edmoutfiles': [],
                      }

    if getattr(self.config.Data, 'useParent', False) and getattr(self.config.Data, 'secondaryInputDataset', None):
        msg = "Invalid CRAB configuration: Parameters Data.useParent and Data.secondaryInputDataset cannot be used together."
        raise ConfigurationException(msg)

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({'jobarch': scram.getScramArch(),
                            'jobsw': scram.getCmsswVersion()})

    # Build tarball
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
        if len(tarUUID):
            tarFilename = os.path.join(self.workdir, tarUUID + 'default.tgz')
            debugTarFilename = os.path.join(self.workdir, 'debugFiles.tgz')
            cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE)
        else:
            raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
    else:
        _, tarFilename = tempfile.mkstemp(suffix='.tgz')
        # debugTarFilename is needed unconditionally below, so create it here too.
        _, debugTarFilename = tempfile.mkstemp(suffix='.tgz')
        _, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')

    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset

    ## Create CMSSW config.
    self.logger.debug("self.config: %s" % (self.config))
    self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName))
    ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent,
    ## in the sense that a second loading of the same pset may not produce the same
    ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW
    ## pset twice. However, some "complicated" psets seem to evade the caching.
    ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that
    ## it can be reused later if wanted (for example, in PrivateMC when checking if
    ## the pset has an LHE source) instead of having to load the pset again.
    ## As for what "complicated" psets means, Daniel Riley said that there are
    ## some psets where one module modifies the configuration of another module.
    self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                                userConfig=self.config.JobType.psetName)

    ## If there is a CMSSW pset, do a basic validation of it.
    if not bootstrapDone() and self.config.JobType.psetName:
        valid, msg = self.cmsswCfg.validateConfig()
        if not valid:
            raise ConfigurationException(msg)

    ## We need to put the pickled CMSSW configuration in the right place.
    ## Here, we determine if the bootstrap script has already run and prepared
    ## everything for us. In that case we move the file, otherwise we pickle.dump the pset.
    if not bootstrapDone():
        # Write out CMSSW config
        self.cmsswCfg.writeFile(cfgOutputName)
    else:
        # Move the pickled and the configuration files created by the bootstrap script
        self.moveCfgFile(cfgOutputName)

    ## Interrogate the CMSSW pset for output files (only output files produced by
    ## PoolOutputModule or TFileService are identified automatically). Do this
    ## automatic detection even if JobType.disableAutomaticOutputCollection = True,
    ## so that we can still classify the output files in EDM, TFile and additional
    ## output files in the Task DB (and the job ad).
    ## TODO: Do we really need this classification at all? cmscp and PostJob read
    ## the FJR to know if an output file is EDM, TFile or other.
    edmfiles, tfiles = self.cmsswCfg.outputFiles()
    ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile
    ## output files that are not listed in JobType.outputFiles.
    if getattr(self.config.JobType, 'disableAutomaticOutputCollection',
               getParamDefaultValue('JobType.disableAutomaticOutputCollection')):
        outputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', [])]
        edmfiles = [file for file in edmfiles if file in outputFiles]
        tfiles = [file for file in tfiles if file in outputFiles]
    ## Get the list of additional output files that have to be collected as given
    ## in JobType.outputFiles, but remove duplicates listed already as EDM files or
    ## TFiles.
    addoutputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', [])
                      if re.sub(r'^file:', '', file) not in edmfiles + tfiles]
    self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
    configArguments['edmoutfiles'] = edmfiles
    configArguments['tfileoutfiles'] = tfiles
    configArguments['addoutputfiles'].extend(addoutputFiles)
    ## Give a warning message in case no output file was detected in the CMSSW pset
    ## nor was any specified in the CRAB configuration.
    if not configArguments['edmoutfiles'] and not configArguments['tfileoutfiles'] and not configArguments['addoutputfiles']:
        msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
        if getattr(self.config.JobType, 'disableAutomaticOutputCollection',
                   getParamDefaultValue('JobType.disableAutomaticOutputCollection')):
            msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration"
            msg += " and no output file was explicitly specified in the CRAB configuration."
        else:
            msg += " CRAB could not detect any output file in the CMSSW configuration"
            msg += " nor was any explicitly specified in the CRAB configuration."
        msg += " Hence CRAB will not collect any output file from this task."
        self.logger.warning(msg)

    ## UserTarball calls ScramEnvironment, which can raise EnvironmentException.
    ## Since ScramEnvironment is already called above and the exception is not
    ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
    ## But otherwise we should take this into account.
    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'inputFiles', [])]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        try:
            uploadResult = tb.upload(filecacheurl=filecacheurl)
        except HTTPException as hte:
            if 'X-Error-Info' in hte.headers:
                reason = hte.headers['X-Error-Info']
                reason_re = re.compile(r'\AFile size is ([0-9]*)B\. This is bigger than the maximum allowed size of ([0-9]*)B\.$')
                re_match = reason_re.match(reason)
                if re_match:
                    ISBSize = int(re_match.group(1))
                    ISBSizeLimit = int(re_match.group(2))
                    reason = "%sError%s:" % (colors.RED, colors.NORMAL)
                    reason += " Input sandbox size is ~%sMB. This is bigger than the maximum allowed size of %sMB." % (ISBSize/1024/1024, ISBSizeLimit/1024/1024)
                    ISBContent = sorted(tb.content, reverse=True)
                    biggestFileSize = ISBContent[0][0]
                    ndigits = int(math.ceil(math.log(biggestFileSize + 1, 10)))
                    reason += "\nInput sandbox content sorted by size [Bytes]:"
                    for (size, name) in ISBContent:
                        reason += ("\n%" + str(ndigits) + "s\t%s") % (size, name)
                    raise ClientException(reason)
            raise hte
        except Exception as e:
            msg = ("Impossible to calculate the checksum of the sandbox tarball.\nError message: %s.\n"
                   "More details can be found in %s" % (e, self.logger.logfile))
            LOGGERS['CRAB3'].exception(msg)  # the traceback is only printed into the logfile
            raise ClientException(msg)

    debugFilesUploadResult = None
    with UserTarball(name=debugTarFilename, logger=self.logger, config=self.config) as dtb:
        dtb.addMonFiles()
        try:
            debugFilesUploadResult = dtb.upload(filecacheurl=filecacheurl)
        except Exception as e:
            msg = ("Problem uploading debug_files.tar.gz.\nError message: %s.\n"
                   "More details can be found in %s" % (e, self.logger.logfile))
            LOGGERS['CRAB3'].exception(msg)  # the traceback is only printed into the logfile

    configArguments['cacheurl'] = filecacheurl
    configArguments['cachefilename'] = "%s.tar.gz" % uploadResult
    if debugFilesUploadResult is not None:
        configArguments['debugfilename'] = "%s.tar.gz" % debugFilesUploadResult
    self.logger.debug("Result uploading input files: %(cachefilename)s " % configArguments)

    # Upload list of user-defined input files to process as the primary input
    userFilesList = getattr(self.config.Data, 'userInputFiles', None)
    if userFilesList:
        self.logger.debug("Attaching list of user-specified primary input files.")
        userFilesList = map(string.strip, userFilesList)
        userFilesList = [file for file in userFilesList if file]
        if len(userFilesList) != len(set(userFilesList)):
            msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
            msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries."
            msg += " Duplicated entries will be removed."
            self.logger.warning(msg)
        configArguments['userfiles'] = set(userFilesList)
        configArguments['primarydataset'] = getattr(self.config.Data, 'outputPrimaryDataset', 'CRAB_UserFiles')

    lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name))
        try:
            lumi_list = getLumiList(lumi_mask_name, logger=self.logger)
        except ValueError as ex:
            msg = "%sError%s:" % (colors.RED, colors.NORMAL)
            msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex)
            raise ConfigurationException(msg)

    run_ranges = getattr(self.config.Data, 'runRange', None)
    if run_ranges:
        run_ranges_is_valid = re.match(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges)
        if run_ranges_is_valid:
            run_list = getRunList(run_ranges)
            if lumi_list:
                lumi_list.selectRuns(run_list)
                if not lumi_list:
                    msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null."
                    raise ConfigurationException(msg)
            else:
                if len(run_list) > 50000:
                    msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list))
                    msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                    raise ConfigurationException(msg)
                lumi_list = LumiList(runs=run_list)
        else:
            msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. Example: '12345,99900-99910'"
            raise ConfigurationException(msg)

    if lumi_list:
        configArguments['runs'] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments['lumis'] = [str(reduce(lambda x, y: x + y, lumi_mask[run]))[1:-1].replace(' ', '')
                                    for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments
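# --- Example (illustrative, not part of the original client code) ---
# Two transformations from run() shown in isolation: the Data.runRange
# validation regex, and the per-run lumi encoding described in the inline
# comment ([[1,2],[5,5]] ==> '1,2,5,5'):

import re
from functools import reduce  # builtin in Python 2; imported here for Python 3

# Comma-separated integers or single-dash ranges are accepted.
run_range_re = re.compile(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$')
print(bool(run_range_re.match("12345,99900-99910")))    # True
print(bool(run_range_re.match("12345,99900-99910-5")))  # False (double-dashed range rejected)

# Flatten [[1,2],[5,5]] into the string '1,2,5,5'.
lumi_ranges = [[1, 2], [5, 5]]
flat = reduce(lambda x, y: x + y, lumi_ranges)  # [1, 2, 5, 5]
print(str(flat)[1:-1].replace(' ', ''))         # 1,2,5,5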