def validateConfig(self, config): """ Validate the PrivateMC portion of the config file making sure required values are there and optional values don't conflict. Subclass to CMSSW for most of the work """ valid, reason = self.validateBasicConfig(config) if not valid: return valid, reason ## If publication is True, check that there is a primary dataset name specified. if getattr(config.Data, 'publication', getParamDefaultValue('Data.publication')): if not getattr(config.Data, 'outputPrimaryDataset', getParamDefaultValue('Data.outputPrimaryDataset')): msg = "Invalid CRAB configuration: Parameter Data.outputPrimaryDataset not specified." msg += "\nMC generation job type requires this parameter for publication." return False, msg if not hasattr(config.Data, 'totalUnits'): msg = "Invalid CRAB configuration: Parameter Data.totalUnits not specified." msg += "\nMC generation job type requires this parameter to know how many events to generate." return False, msg elif config.Data.totalUnits <= 0: msg = "Invalid CRAB configuration: Parameter Data.totalUnits has an invalid value (%s)." % (config.Data.totalUnits) msg += " It must be a natural number." return False, msg ## Make sure the splitting algorithm is valid. allowedSplitAlgos = ['EventBased'] if self.splitAlgo not in allowedSplitAlgos: msg = "Invalid CRAB configuration: Parameter Data.splitting has an invalid value ('%s')." % (self.splitAlgo) msg += "\nMC generation job type only supports the following splitting algorithms: %s." % (allowedSplitAlgos) return False, msg return True, "Valid configuration"
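## Illustrative sketch (not part of the client code): a minimal configuration fragment that would pass the PrivateMC checks above, written with the usual WMCore Configuration object used by CRAB configuration files. All dataset, pset and site names below are made-up placeholders.
from WMCore.Configuration import Configuration

config = Configuration()
config.section_('General')
config.section_('JobType')
config.section_('Data')
config.section_('Site')
config.JobType.pluginName = 'PrivateMC'
config.JobType.psetName = 'pset_generation.py'        # placeholder pset
config.Data.splitting = 'EventBased'                  # the only splitting algorithm accepted here
config.Data.unitsPerJob = 1000
config.Data.totalUnits = 100000                       # must be a natural number
config.Data.publication = True
config.Data.outputPrimaryDataset = 'MinBias_Example'  # required because publication is True
config.Site.storageSite = 'T2_XX_Example'             # placeholder site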
def serverInstance(self): """ Derive the correct instance to use and the corresponding server url. The client is allowed to propagate the instance name and corresponding url via crabconfig.py or the crab option --instance. The value passed via the crab option always takes precedence over the value in crabconfig.py. An instance name other than those specified in SERVICE_INSTANCES will be treated as a private instance. """ if hasattr(self.options, 'instance') and self.options.instance is not None: if hasattr(self, 'configuration') and hasattr(self.configuration.General, 'instance') and self.configuration.General.instance is not None: msg = "%sWarning%s: CRAB configuration parameter General.instance is overwritten by the command option --instance;" % (colors.RED, colors.NORMAL) msg += " %s instance will be used." % (self.options.instance) self.logger.info(msg) if self.options.instance in SERVICE_INSTANCES.keys(): instance = self.options.instance serverurl = SERVICE_INSTANCES[instance] else: instance = 'private' serverurl = self.options.instance elif hasattr(self, 'configuration') and hasattr(self.configuration.General, 'instance') and self.configuration.General.instance is not None: if self.configuration.General.instance in SERVICE_INSTANCES.keys(): instance = self.configuration.General.instance serverurl = SERVICE_INSTANCES[instance] else: instance = 'private' serverurl = self.configuration.General.instance else: instance = getParamDefaultValue('General.instance') serverurl = SERVICE_INSTANCES[instance] return instance, serverurl
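## Minimal standalone sketch of the precedence implemented above: the --instance command line option wins over General.instance from the configuration, and any name not found in SERVICE_INSTANCES is interpreted as a private server URL. The SERVICE_INSTANCES mapping and the default below are made-up placeholders.
SERVICE_INSTANCES = {'prod': 'cmsweb.example.cern.ch', 'preprod': 'cmsweb-testbed.example.cern.ch'}

def resolve_instance(cli_instance, config_instance, default='prod'):
    requested = cli_instance or config_instance or default
    if requested in SERVICE_INSTANCES:
        return requested, SERVICE_INSTANCES[requested]
    return 'private', requested  # unknown names are taken to be a private server URL

print(resolve_instance(None, 'preprod'))            # ('preprod', 'cmsweb-testbed.example.cern.ch')
print(resolve_instance('my.host:8443/crab', None))  # ('private', 'my.host:8443/crab')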
def validateConfig(self, config): """ Validate the CMSSW portion of the config file making sure required values are there and optional values don't conflict. """ valid, reason = self.validateBasicConfig(config) if not valid: return valid, reason ## Make sure only one of the two parameters Data.inputDataset and Data.userInputFiles ## was specified. if getattr(config.Data, 'inputDataset', None) and getattr(config.Data, 'userInputFiles', None): msg = "Invalid CRAB configuration: Analysis job type accepts either an input dataset or a set of user input files to run on, but not both." msg += "\nSuggestion: Specify only one of the two parameters, Data.inputDataset or Data.userInputFiles, but not both." return False, msg ## Make sure at least one of the two parameters Data.inputDataset and Data.userInputFiles ## was specified. if not getattr(config.Data, 'inputDataset', None) and not getattr(config.Data, 'userInputFiles', None): msg = "Invalid CRAB configuration: Analysis job type requires an input dataset or a set of user input files to run on." msg += "\nSuggestion: To specify an input dataset use the parameter Data.inputDataset." msg += " To specify a set of user input files use the parameter Data.userInputFiles." return False, msg ## When running over an input dataset, we don't accept that the user specifies a ## primary dataset, because the primary dataset will already be extracted from ## the input dataset. if getattr(config.Data, 'inputDataset', None) and getattr(config.Data, 'outputPrimaryDataset', None): msg = "Invalid CRAB configuration: Analysis job type with input dataset does not accept an output primary dataset name to be specified," msg += " because the latter will be extracted from the former." msg += "\nSuggestion: Remove the parameter Data.outputPrimaryDataset." return False, msg ## When running over user input files with publication turned on, we want the ## user to specify the primary dataset to be used for publication. if getattr(config.Data, 'publication', getParamDefaultValue('Data.publication')): if not getattr(config.Data, 'inputDataset', None): if not getattr(config.Data, 'outputPrimaryDataset', None): msg = "Invalid CRAB configuration: Parameter Data.outputPrimaryDataset not specified." msg += "\nAnalysis job type without input dataset requires this parameter for publication." return False, msg ## When running over user input files, make sure the splitting mode is 'FileBased'. if getattr(config.Data, 'userInputFiles', None) and self.splitAlgo != 'FileBased': msg = "Invalid CRAB configuration: Analysis job type with user input files only supports file-based splitting." msg += "\nSuggestion: Set Data.splitting = 'FileBased'." return False, msg ## Make sure the splitting algorithm is valid. allowedSplitAlgos = ['FileBased', 'LumiBased', 'EventAwareLumiBased'] self.checkAutomaticAvail(allowedSplitAlgos) if self.splitAlgo not in allowedSplitAlgos: msg = "Invalid CRAB configuration: Parameter Data.splitting has an invalid value ('%s')." % (self.splitAlgo) msg += "\nAnalysis job type only supports the following splitting algorithms: %s." % (allowedSplitAlgos) return False, msg return True, "Valid configuration"
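## Standalone sketch of the mutual-exclusion rule enforced above: exactly one of Data.inputDataset and Data.userInputFiles must be given for the Analysis job type. The helper name and the dataset string are illustrative only.
def check_input_source(inputDataset, userInputFiles):
    if inputDataset and userInputFiles:
        return False, "specify either Data.inputDataset or Data.userInputFiles, not both"
    if not inputDataset and not userInputFiles:
        return False, "specify one of Data.inputDataset or Data.userInputFiles"
    return True, "ok"

print(check_input_source('/Primary_Example/Processed_Example-v1/AOD', None))  # (True, 'ok')
print(check_input_source(None, None))                                         # (False, ...)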
def validateConfig(self, config): """ Validate the PrivateMC portion of the config file making sure required values are there and optional values don't conflict. Subclass to CMSSW for most of the work """ valid, reason = self.validateBasicConfig(config) if not valid: return valid, reason ## Check that there is no input dataset specified. if getattr(config.Data, 'inputDataset', None): msg = "Invalid CRAB configuration: MC generation job type does not use an input dataset." msg += "\nIf you really intend to run over an input dataset, then you have to run an analysis job type (i.e. set JobType.pluginName = 'Analysis')." return False, msg ## If publication is True, check that there is a primary dataset name specified. if getattr(config.Data, 'publication', getParamDefaultValue('Data.publication')): if not hasattr(config.Data, 'primaryDataset'): msg = "Invalid CRAB configuration: Parameter Data.primaryDataset not specified." msg += "\nMC generation job type requires this parameter for publication." return False, msg if not hasattr(config.Data, 'totalUnits'): msg = "Invalid CRAB configuration: Parameter Data.totalUnits not specified." msg += "\nMC generation job type requires this parameter to know how many events to generate." return False, msg elif config.Data.totalUnits <= 0: msg = "Invalid CRAB configuration: Parameter Data.totalUnits has an invalid value (%s)." % (config.Data.totalUnits) msg += " It must be a natural number." return False, msg if self.splitAlgo != 'EventBased': msg = "Invalid CRAB configuration: MC generation job type only supports event-based splitting (i.e. Data.splitting = 'EventBased')." return False, msg return True, "Valid configuration"
def validateConfig(self): """ __validateConfig__ Checking if needed input parameters are there. Not all the commands require a configuration. """ ## Check that the configuration object has the sections we expect it to have. ## (WMCore already checks that attributes added to the configuration object are of type ConfigSection.) ## Even if not all configuration sections need to be there, we anyway request ## the user to add all the sections in the configuration file. if not hasattr(self.configuration, 'General'): msg = "Invalid CRAB configuration: Section 'General' is missing." return False, msg if not hasattr(self.configuration, 'JobType'): msg = "Invalid CRAB configuration: Section 'JobType' is missing." return False, msg if not hasattr(self.configuration, 'Data'): msg = "Invalid CRAB configuration: Section 'Data' is missing." return False, msg if not hasattr(self.configuration, 'Site'): msg = "Invalid CRAB configuration: Section 'Site' is missing." return False, msg ## Some parameters may have been renamed. Check here if the configuration file has an old ## parameter defined, and in that case tell the user what is the new parameter name. for old_param, new_param in renamedParams.iteritems(): if len(old_param.split('.')) != 2 or len(new_param['newParam'].split('.')) != 2: continue old_param_section, old_param_name = old_param.split('.') if hasattr(self.configuration, old_param_section) and hasattr(getattr(self.configuration, old_param_section), old_param_name): msg = "Invalid CRAB configuration: Parameter %s has been renamed to %s" % (old_param, new_param['newParam']) if new_param['version'] != None: msg += " starting from CRAB %s" % (new_param['version']) msg += "; please change your configuration file accordingly." return False, msg ## Check if there are unknown parameters (and try to suggest the correct parameter name). all_config_params = configParametersInfo.keys() SpellChecker.DICTIONARY = SpellChecker.train(all_config_params) for section in self.configuration.listSections_(): for attr in getattr(self.configuration, section).listSections_(): param = (section + '.' + attr) if not SpellChecker.is_correct(param): msg = "Invalid CRAB configuration: Parameter %s is not known." % (param) if SpellChecker.correct(param) != param: msg += " Maybe you mean %s?" % (SpellChecker.correct(param)) return False, msg ## Check that each parameter specified in the configuration file is of the ## type specified in the configuration map. ## Check that, if a parameter is a required one and it has no default value, ## then it must be specified in the configuration file. for paramName, paramInfo in configParametersInfo.iteritems(): requiredTypeName = paramInfo['type'] try: requiredType = getattr(types, requiredTypeName) except AttributeError: msg = "Invalid type %s specified in CRABClient configuration mapping for parameter %s." % (requiredTypeName, paramName) return False, msg attrs = paramName.split('.') obj = self.configuration while attrs and obj is not None: obj = getattr(obj, attrs.pop(0), None) if obj is not None: if not isinstance(obj, requiredType): msg = "Invalid CRAB configuration: Parameter %s requires a value of type %s (while a value of type %s was given)." \ % (paramName, str(requiredType), str(type(obj))) if paramName == "Data.userInputFiles": msg += "\nIn CRAB v3.3.14 the configuration parameter Data.userInputFiles has been modified to directly take a (python) list of primary input files." 
msg += " Previously it was taking the name of a local text file where the primary input files were listed." msg += " One can still use a text file and convert its content into a python list by doing Data.userInputFiles = list(open('my_list_of_files.txt'))" return False, msg elif requiredType == list: if not all(isinstance(arg, str) for arg in obj): msg = "Invalid CRAB configuration: Parameter %s has to be a list of strings." % (paramName) return False, msg elif getParamDefaultValue(paramName) is None and paramInfo['required']: msg = "Invalid CRAB configuration: Parameter %s is missing." % (paramName) return False, msg return True, "Valid configuration"
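## Standalone sketch of the dotted-parameter walk used in the type check above: 'Section.param' is split on '.', the configuration tree is descended with getattr, and the value found (or None) is compared against the expected type. The toy _Section class is a made-up stand-in for the real WMCore ConfigSection objects.
class _Section(object):
    pass

config = _Section()
config.Data = _Section()
config.Data.unitsPerJob = 100

def lookup(cfg, dotted_name):
    obj = cfg
    for attr in dotted_name.split('.'):
        obj = getattr(obj, attr, None)
        if obj is None:
            break
    return obj

value = lookup(config, 'Data.unitsPerJob')
print(value, isinstance(value, int))     # 100 True
print(lookup(config, 'Data.splitting'))  # None (parameter not set)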
def run(self, filecacheurl = None): """ Override run() for JobType """ configArguments = {'addoutputfiles' : [], 'adduserfiles' : [], 'tfileoutfiles' : [], 'edmoutfiles' : [], } if getattr(self.config.Data, 'useParent', False) and getattr(self.config.Data, 'secondaryInputDataset', None): msg = "Invalid CRAB configuration: Parameters Data.useParent and Data.secondaryInputDataset cannot be used together." raise ConfigurationException(msg) # Get SCRAM environment scram = ScramEnvironment(logger=self.logger) configArguments.update({'jobarch': scram.getScramArch(), 'jobsw': scram.getCmsswVersion()}) # Build tarball if self.workdir: tarUUID = PandaInterface.wrappedUuidGen() self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID) if len(tarUUID): tarFilename = os.path.join(self.workdir, tarUUID + 'default.tgz') debugTarFilename = os.path.join(self.workdir, 'debugFiles.tgz') cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE) else: raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.') else: _, tarFilename = tempfile.mkstemp(suffix='.tgz') _, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py') if getattr(self.config.Data, 'inputDataset', None): configArguments['inputdata'] = self.config.Data.inputDataset ## Create CMSSW config. self.logger.debug("self.config: %s" % (self.config)) self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName)) ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent ## in the sense that a second loading of the same pset may not produce the same ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW ## pset twice. However, some "complicated" psets seem to evade the caching. ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that ## it can be reused later if wanted (for example, in PrivateMC when checking if ## the pset has an LHE source) instead of having to load the pset again. ## As for what does "complicated" psets mean, Daniel Riley said that there are ## some psets where one module modifies the configuration from another module. self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger, userConfig=self.config.JobType.psetName) ## If there is a CMSSW pset, do a basic validation of it. if not bootstrapDone() and self.config.JobType.psetName: valid, msg = self.cmsswCfg.validateConfig() if not valid: raise ConfigurationException(msg) ## We need to put the pickled CMSSW configuration in the right place. ## Here, we determine if the bootstrap script already run and prepared everything ## for us. In such case we move the file, otherwise we pickle.dump the pset if not bootstrapDone(): # Write out CMSSW config self.cmsswCfg.writeFile(cfgOutputName) else: # Move the pickled and the configuration files created by the bootstrap script self.moveCfgFile(cfgOutputName) ## Interrogate the CMSSW pset for output files (only output files produced by ## PoolOutputModule or TFileService are identified automatically). Do this ## automatic detection even if JobType.disableAutomaticOutputCollection = True, ## so that we can still classify the output files in EDM, TFile and additional ## output files in the Task DB (and the job ad). ## TODO: Do we really need this classification at all? cmscp and PostJob read ## the FJR to know if an output file is EDM, TFile or other. 
edmfiles, tfiles = self.cmsswCfg.outputFiles() ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile ## output files that are not listed in JobType.outputFiles. if getattr(self.config.JobType, 'disableAutomaticOutputCollection', getParamDefaultValue('JobType.disableAutomaticOutputCollection')): outputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', [])] edmfiles = [file for file in edmfiles if file in outputFiles] tfiles = [file for file in tfiles if file in outputFiles] ## Get the list of additional output files that have to be collected as given ## in JobType.outputFiles, but remove duplicates listed already as EDM files or ## TFiles. addoutputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', []) if re.sub(r'^file:', '', file) not in edmfiles+tfiles] self.logger.debug("The following EDM output files will be collected: %s" % edmfiles) self.logger.debug("The following TFile output files will be collected: %s" % tfiles) self.logger.debug("The following user output files will be collected: %s" % addoutputFiles) configArguments['edmoutfiles'] = edmfiles configArguments['tfileoutfiles'] = tfiles configArguments['addoutputfiles'].extend(addoutputFiles) ## Give warning message in case no output file was detected in the CMSSW pset ## nor was any specified in the CRAB configuration. if not configArguments['edmoutfiles'] and not configArguments['tfileoutfiles'] and not configArguments['addoutputfiles']: msg = "%sWarning%s:" % (colors.RED, colors.NORMAL) if getattr(self.config.JobType, 'disableAutomaticOutputCollection', getParamDefaultValue('JobType.disableAutomaticOutputCollection')): msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration" msg += " and no output file was explicitly specified in the CRAB configuration." else: msg += " CRAB could not detect any output file in the CMSSW configuration" msg += " nor was any explicitly specified in the CRAB configuration." msg += " Hence CRAB will not collect any output file from this task." self.logger.warning(msg) ## UserTarball calls ScramEnvironment which can raise EnvironmentException. ## Since ScramEnvironment is already called above and the exception is not ## handled, we are sure that if we reached this point it will not raise EnvironmentException. ## But otherwise we should take this into account. with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb: inputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'inputFiles', [])] tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName) configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles] try: uploadResult = tb.upload(filecacheurl = filecacheurl) except HTTPException as hte: if 'X-Error-Info' in hte.headers: reason = hte.headers['X-Error-Info'] reason_re = re.compile(r'\AFile size is ([0-9]*)B\. This is bigger than the maximum allowed size of ([0-9]*)B\.$') re_match = reason_re.match(reason) if re_match: ISBSize = int(re_match.group(1)) ISBSizeLimit = int(re_match.group(2)) reason = "%sError%s:" % (colors.RED, colors.NORMAL) reason += " Input sanbox size is ~%sMB. This is bigger than the maximum allowed size of %sMB." 
% (ISBSize/1024/1024, ISBSizeLimit/1024/1024) ISBContent = sorted(tb.content, reverse=True) biggestFileSize = ISBContent[0][0] ndigits = int(math.ceil(math.log(biggestFileSize+1, 10))) reason += "\nInput sanbox content sorted by size[Bytes]:" for (size, name) in ISBContent: reason += ("\n%" + str(ndigits) + "s\t%s") % (size, name) raise ClientException(reason) raise hte except Exception as e: msg = ("Impossible to calculate the checksum of the sandbox tarball.\nError message: %s.\n" "More details can be found in %s" % (e, self.logger.logfile)) LOGGERS['CRAB3'].exception(msg) #the traceback is only printed into the logfile raise ClientException(msg) debugFilesUploadResult = None with UserTarball(name=debugTarFilename, logger=self.logger, config=self.config) as dtb: dtb.addMonFiles() try: debugFilesUploadResult = dtb.upload(filecacheurl = filecacheurl) except Exception as e: msg = ("Problem uploading debug_files.tar.gz.\nError message: %s.\n" "More details can be found in %s" % (e, self.logger.logfile)) LOGGERS['CRAB3'].exception(msg) #the traceback is only printed into the logfile configArguments['cacheurl'] = filecacheurl configArguments['cachefilename'] = "%s.tar.gz" % uploadResult if debugFilesUploadResult is not None: configArguments['debugfilename'] = "%s.tar.gz" % debugFilesUploadResult self.logger.debug("Result uploading input files: %(cachefilename)s " % configArguments) # Upload list of user-defined input files to process as the primary input userFilesList = getattr(self.config.Data, 'userInputFiles', None) if userFilesList: self.logger.debug("Attaching list of user-specified primary input files.") userFilesList = map(string.strip, userFilesList) userFilesList = [file for file in userFilesList if file] if len(userFilesList) != len(set(userFilesList)): msg = "%sWarning%s:" % (colors.RED, colors.NORMAL) msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries." msg += " Duplicated entries will be removed." self.logger.warning(msg) configArguments['userfiles'] = set(userFilesList) configArguments['primarydataset'] = getattr(self.config.Data, 'outputPrimaryDataset', 'CRAB_UserFiles') lumi_mask_name = getattr(self.config.Data, 'lumiMask', None) lumi_list = None if lumi_mask_name: self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name)) try: lumi_list = getLumiList(lumi_mask_name, logger = self.logger) except ValueError as ex: msg = "%sError%s:" % (colors.RED, colors.NORMAL) msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex) raise ConfigurationException(msg) run_ranges = getattr(self.config.Data, 'runRange', None) if run_ranges: run_ranges_is_valid = re.match('^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges) if run_ranges_is_valid: run_list = getRunList(run_ranges) if lumi_list: lumi_list.selectRuns(run_list) if not lumi_list: msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null." raise ConfigurationException(msg) else: if len(run_list) > 50000: msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list)) msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs." raise ConfigurationException(msg) lumi_list = LumiList(runs = run_list) else: msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. 
Example: '12345,99900-99910'" raise ConfigurationException(msg) if lumi_list: configArguments['runs'] = lumi_list.getRuns() ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5' lumi_mask = lumi_list.getCompactList() configArguments['lumis'] = [str(reduce(lambda x,y: x+y, lumi_mask[run]))[1:-1].replace(' ','') for run in configArguments['runs']] configArguments['jobtype'] = 'Analysis' return tarFilename, configArguments
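## Standalone sketch of two pieces of the run() method above: the regular expression that accepts Data.runRange as a comma separated list of integers and inclusive ranges, and the flattening of a compact lumi mask ({run: [[first, last], ...]}) into the comma separated string stored in configArguments['lumis']. The regex and the encoding are taken from the code above; the sample values are made up.
import re
from functools import reduce  # a builtin in Python 2, imported from functools in Python 3

run_range_re = re.compile(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$')
print(bool(run_range_re.match('12345,99900-99910')))  # True
print(bool(run_range_re.match('12345,99900-')))       # False

lumi_mask = {'1': [[1, 2], [5, 5]]}
runs = ['1']
lumis = [str(reduce(lambda x, y: x + y, lumi_mask[run]))[1:-1].replace(' ', '')
         for run in runs]
print(lumis)  # ['1,2,5,5']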
def validateConfig(self): """ __validateConfig__ Checking if needed input parameters are there """ valid, msg = SubCommand.validateConfig(self) if not valid: return False, msg ## Check that the configuration object has the sections we expect it to have. ## (WMCore already checks that attributes added to the configuration object are of type ConfigSection.) ## Even if not all configuration sections need to be there, we anyway request ## the user to add all the sections in the configuration file. if not hasattr(self.configuration, 'General'): msg = "CRAB configuration problem: Section 'General' is missing" return False, msg if not hasattr(self.configuration, 'JobType'): msg = "CRAB configuration problem: Section 'JobType' is missing" return False, msg if not hasattr(self.configuration, 'Data'): msg = "CRAB configuration problem: Section 'Data' is missing" return False, msg if not hasattr(self.configuration, 'Site'): msg = "CRAB configuration problem: Section 'Site' is missing" return False, msg ## Some parameters may have been renamed. Check here if the configuration file has an old ## parameter defined, and in that case tell the user what is the new parameter name. for old_param, new_param in renamed_params.iteritems(): if len(old_param.split('.')) != 2 or len(new_param.split('.')) != 2: continue old_param_section, old_param_name = old_param.split('.') if hasattr(self.configuration, old_param_section) and hasattr(getattr(self.configuration, old_param_section), old_param_name): msg = "CRAB configuration problem: Parameter %s has been renamed to %s; please change your configuration file accordingly" % (old_param, new_param) return False, msg ## Check that Data.unitsPerjob is specified. if hasattr(self.configuration.Data, 'unitsPerJob'): try: float(self.configuration.Data.unitsPerJob) except ValueError: msg = "CRAB configuration problem: Parameter Data.unitsPerJob must be a valid number, not %s" % self.configuration.Data.unitsPerJob return False, msg ## Check that JobType.pluginName and JobType.externalPluginFile are not both specified. if hasattr(self.configuration.JobType, 'pluginName') and hasattr(self.configuration.JobType, 'externalPluginFile'): msg = "CRAB configuration problem: Only one of JobType.pluginName or JobType.externalPluginFile parameters can be specified" pluginName_default = getParamDefaultValue('JobType.pluginName') if pluginName_default: msg += "\nIf neither JobType.pluginName nor JobType.externalPluginFile would be specified, the default JobType.pluginName = '%s' would be used" \ % pluginName_default return False, msg ## Load the external plugin or check that the crab plugin is valid. external_plugin_name = getattr(self.configuration.JobType, 'externalPluginFile', None) crab_plugin_name = getattr(self.configuration.JobType, 'pluginName', None) crab_job_types = {'ANALYSIS': None, 'PRIVATEMC': None} #getJobTypes() if external_plugin_name: addPlugin(external_plugin_name) # Do we need to do this here? if crab_plugin_name and upper(crab_plugin_name) not in crab_job_types: msg = "CRAB configuration problem: Parameter JobType.pluginName has an invalid value '%s'" % crab_plugin_name msg += "\nAllowed values are: %s" % ", ".join(['%s' % job_type for job_type in crab_job_types.keys()]) return False, msg ## Check that the particular combination (Data.publication = True, General.transferOutputs = False) is not specified. 
if hasattr(self.configuration.Data, 'publication') and hasattr(self.configuration.General, 'transferOutputs'): if self.configuration.Data.publication and not self.configuration.General.transferOutputs: msg = "CRAB configuration problem: Data.publication is on, but General.transferOutputs is off" msg += "\nPublication can not be performed if the output files are not transferred to a permanent storage" return False, msg ## Check that a storage site is specified if General.transferOutputs = True or General.transferLogs = True. if not hasattr(self.configuration.Site, 'storageSite'): if (hasattr(self.configuration.General, 'transferOutputs') and self.configuration.General.transferOutputs) or \ (hasattr(self.configuration.General, 'transferLogs') and self.configuration.General.transferLogs): msg = "CRAB configuration problem: Parameter Site.storageSite is missing" return False, msg ## If an input dataset and a DBS URL are specified, check that the DBS URL is a good one. ## Also, if the DBS URL is 'phys0x', check that the input dataset tier is USER. if hasattr(self.configuration.Data, 'inputDBS'): if hasattr(self.configuration.Data, 'inputDataset'): msg = None dbs_urls_aliases = DBSURLS['reader'].keys() dbs_urls = DBSURLS['reader'].values() if (self.configuration.Data.inputDBS not in dbs_urls_aliases) and (self.configuration.Data.inputDBS.rstrip('/') not in dbs_urls): msg = "CRAB configuration problem: Parameter Data.inputDBS has an invalid value '%s'" % self.configuration.Data.inputDBS msg += "\nAllowed values are: " msg += "\n ".join(["'%s' ('%s')" % (alias, url) for alias, url in DBSURLS['reader'].iteritems()]) local_dbs_urls_aliases = ['phys01', 'phys02', 'phys03'] local_dbs_urls = [DBSURLS['reader'][alias] for alias in local_dbs_urls_aliases if alias in DBSURLS['reader']] if self.configuration.Data.inputDBS in local_dbs_urls + local_dbs_urls_aliases: inputDataset_parts = self.configuration.Data.inputDataset.split('/') inputDataset_parts.pop(0) inputDataset_tier = inputDataset_parts[-1] if len(inputDataset_parts) == 3 else None user_data_tiers = ['USER'] if inputDataset_tier not in user_data_tiers: msg = "CRAB configuration problem: A local DBS instance '%s' was specified for reading an input dataset of tier %s" \ % (self.configuration.Data.inputDBS, inputDataset_tier) msg += "\nDatasets of tier different than %s must be read from the global DBS instance; this is, set Data.inputDBS = 'global'" \ % (", ".join(user_data_tiers[:-1]) + " or " + user_data_tiers[-1] if len(user_data_tiers) > 1 else user_data_tiers[0]) if msg: inputDBS_default = getParamDefaultValue('Data.inputDBS') if inputDBS_default: inputDBS_default, inputDBS_default_alias = self.getDBSURLAndAlias(inputDBS_default, 'reader') if inputDBS_default and inputDBS_default_alias: msg += "\nIf Data.inputDBS would not be specified, the default '%s' ('%s') would be used" % (inputDBS_default_alias, inputDBS_default) return False, msg ## If a publication DBS URL is specified and publication is ON, check that the DBS URL is a good one. 
if hasattr(self.configuration.Data, 'publishDBS'): publication_default = getParamDefaultValue('Data.publication') if getattr(self.configuration.Data, 'publication', publication_default): dbs_urls = DBSURLS['writer'].values() dbs_urls_aliases = DBSURLS['writer'].keys() if (self.configuration.Data.publishDBS not in dbs_urls_aliases) and (self.configuration.Data.publishDBS.rstrip('/') not in dbs_urls): msg = "CRAB configuration problem: Parameter Data.publishDBS has an invalid value '%s'" % self.configuration.Data.publishDBS msg += "\nAllowed values are: " msg += "\n ".join(["'%s' ('%s')" % (alias, url) for alias, url in DBSURLS['writer'].iteritems()]) publishDBS_default = getParamDefaultValue('Data.publishDBS') if publishDBS_default: publishDBS_default, publishDBS_default_alias = self.getDBSURLAndAlias(publishDBS_default, 'writer') if publishDBS_default and publishDBS_default_alias: msg += "\nIf Data.publishDBS would not be specified, the default '%s' ('%s') would be used" \ % (publishDBS_default_alias, publishDBS_default) return False, msg if hasattr(self.configuration.JobType, 'scriptExe'): if not os.path.isfile(self.configuration.JobType.scriptExe): msg = "Cannot find the file %s specified in the scriptExe configuration parameter" % self.configuration.JobType.scriptExe return False, msg return True, "Valid configuration"
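## Standalone sketch of the Data.inputDBS / Data.publishDBS validation above: the configured value may be either an alias or a full URL from the corresponding DBSURLS map, compared after stripping a trailing slash. The DBSURLS mapping below is a made-up placeholder, not the real list of instances.
DBSURLS = {'reader': {'global': 'https://dbs.example.cern.ch/dbs/prod/global/DBSReader',
                      'phys03': 'https://dbs.example.cern.ch/dbs/prod/phys03/DBSReader'}}

def is_valid_input_dbs(value):
    aliases = DBSURLS['reader'].keys()
    urls = DBSURLS['reader'].values()
    return value in aliases or value.rstrip('/') in urls

print(is_valid_input_dbs('global'))                                                  # True
print(is_valid_input_dbs('https://dbs.example.cern.ch/dbs/prod/phys03/DBSReader/'))  # True (trailing slash stripped)
print(is_valid_input_dbs('phys99'))                                                  # False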
def validateConfig(self): """ __validateConfig__ Checking if needed input parameters are there """ valid, msg = SubCommand.validateConfig(self) if not valid: return False, msg ## Check that Data.unitsPerjob is specified. if hasattr(self.configuration.Data, 'unitsPerJob'): try: float(self.configuration.Data.unitsPerJob) except ValueError: msg = "Invalid CRAB configuration: Parameter Data.unitsPerJob must be a valid number, not %s." % (self.configuration.Data.unitsPerJob) return False, msg ## Check that JobType.pluginName and JobType.externalPluginFile are not both specified. if hasattr(self.configuration.JobType, 'pluginName') and hasattr(self.configuration.JobType, 'externalPluginFile'): msg = "Invalid CRAB configuration: Only one of JobType.pluginName or JobType.externalPluginFile parameters can be specified." pluginName_default = getParamDefaultValue('JobType.pluginName') if pluginName_default: msg += "\nIf neither JobType.pluginName nor JobType.externalPluginFile would be specified," msg += " the default JobType.pluginName = '%s' would be used." % (pluginName_default) return False, msg ## Load the external plugin or check that the crab plugin is valid. external_plugin_name = getattr(self.configuration.JobType, 'externalPluginFile', None) crab_plugin_name = getattr(self.configuration.JobType, 'pluginName', None) crab_job_types = {'ANALYSIS': None, 'PRIVATEMC': None} #getJobTypes() if external_plugin_name: addPlugin(external_plugin_name) # Do we need to do this here? if crab_plugin_name: if upper(crab_plugin_name) not in crab_job_types: msg = "Invalid CRAB configuration: Parameter JobType.pluginName has an invalid value ('%s')." % (crab_plugin_name) msg += "\nAllowed values are: %s." % (", ".join(['%s' % job_type for job_type in crab_job_types.keys()])) return False, msg msg = "Will use CRAB %s plugin" % ("Analysis" if upper(crab_plugin_name) == 'ANALYSIS' else "PrivateMC") msg += " (i.e. will run %s job type)." % ("an analysis" if upper(crab_plugin_name) == 'ANALYSIS' else "a MC generation") self.logger.debug(msg) ## Check that the particular combination (Data.publication = True, General.transferOutputs = False) is not specified. if getattr(self.configuration.Data, 'publication', getParamDefaultValue('Data.publication')) and \ not getattr(self.configuration.General, 'transferOutputs', getParamDefaultValue('General.transferOutputs')): msg = "Invalid CRAB configuration: Data.publication is True, but General.transferOutputs is False." msg += "\nPublication can not be performed if the output files are not transferred to a permanent storage." return False, msg ## Check that a storage site is specified if General.transferOutputs = True or General.transferLogs = True. if not hasattr(self.configuration.Site, 'storageSite'): if getattr(self.configuration.General, 'transferLogs', getParamDefaultValue('General.transferLogs')) or \ getattr(self.configuration.General, 'transferOutputs', getParamDefaultValue('General.transferOutputs')): msg = "Invalid CRAB configuration: Parameter Site.storageSite is missing." return False, msg ## If an input dataset and a DBS URL are specified, check that the DBS URL is a good one. ## Also, if the DBS URL is 'phys0x', check that the input dataset tier is USER. 
if hasattr(self.configuration.Data, 'inputDBS'): if hasattr(self.configuration.Data, 'inputDataset'): msg = None dbs_urls_aliases = DBSURLS['reader'].keys() dbs_urls = DBSURLS['reader'].values() if (self.configuration.Data.inputDBS not in dbs_urls_aliases) and (self.configuration.Data.inputDBS.rstrip('/') not in dbs_urls): msg = "Invalid CRAB configuration: Parameter Data.inputDBS has an invalid value ('%s')." % (self.configuration.Data.inputDBS) msg += "\nAllowed values are: " msg += "\n ".join(["'%s' ('%s')" % (alias, url) for alias, url in DBSURLS['reader'].iteritems()]) local_dbs_urls_aliases = ['phys01', 'phys02', 'phys03'] local_dbs_urls = [DBSURLS['reader'][alias] for alias in local_dbs_urls_aliases if alias in DBSURLS['reader']] if self.configuration.Data.inputDBS in local_dbs_urls + local_dbs_urls_aliases: inputDataset_parts = self.configuration.Data.inputDataset.split('/') inputDataset_parts.pop(0) inputDataset_tier = inputDataset_parts[-1] if len(inputDataset_parts) == 3 else None user_data_tiers = ['USER'] if inputDataset_tier not in user_data_tiers: msg = "Invalid CRAB configuration: A local DBS instance '%s' was specified for reading an input dataset of tier %s." \ % (self.configuration.Data.inputDBS, inputDataset_tier) msg += "\nDatasets of tier different than %s must be read from the global DBS instance; this is, set Data.inputDBS = 'global'." \ % (", ".join(user_data_tiers[:-1]) + " or " + user_data_tiers[-1] if len(user_data_tiers) > 1 else user_data_tiers[0]) if msg: inputDBS_default = getParamDefaultValue('Data.inputDBS') if inputDBS_default: inputDBS_default, inputDBS_default_alias = self.getDBSURLAndAlias(inputDBS_default, 'reader') if inputDBS_default and inputDBS_default_alias: msg += "\nIf Data.inputDBS would not be specified, the default '%s' ('%s') would be used." % (inputDBS_default_alias, inputDBS_default) return False, msg ## If a publication DBS URL is specified and publication is ON, check that the DBS URL is a good one. if hasattr(self.configuration.Data, 'publishDBS'): if getattr(self.configuration.Data, 'publication', getParamDefaultValue('Data.publication')): dbs_urls = DBSURLS['writer'].values() dbs_urls_aliases = DBSURLS['writer'].keys() if (self.configuration.Data.publishDBS not in dbs_urls_aliases) and (self.configuration.Data.publishDBS.rstrip('/') not in dbs_urls): msg = "Invalid CRAB configuration: Parameter Data.publishDBS has an invalid value ('%s')." % (self.configuration.Data.publishDBS) msg += "\nAllowed values are: " msg += "\n ".join(["'%s' ('%s')" % (alias, url) for alias, url in DBSURLS['writer'].iteritems()]) publishDBS_default = getParamDefaultValue('Data.publishDBS') if publishDBS_default: publishDBS_default, publishDBS_default_alias = self.getDBSURLAndAlias(publishDBS_default, 'writer') if publishDBS_default and publishDBS_default_alias: msg += "\nIf Data.publishDBS would not be specified, the default '%s' ('%s') would be used." \ % (publishDBS_default_alias, publishDBS_default) return False, msg if hasattr(self.configuration.JobType, 'scriptExe'): if not os.path.isfile(self.configuration.JobType.scriptExe): msg = "Cannot find the file %s specified in the JobType.scriptExe configuration parameter." % (self.configuration.JobType.scriptExe) return False, msg return True, "Valid configuration"
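## Standalone sketch of the data-tier check above: a dataset name of the form '/Primary/Processed/TIER' is split on '/', the empty field before the leading slash is dropped, and the last of the three remaining fields is taken as the tier, which must be USER when reading from a local phys0x DBS instance. The dataset names are made-up examples.
def dataset_tier(dataset):
    parts = dataset.split('/')
    parts.pop(0)  # drop the empty string produced by the leading '/'
    return parts[-1] if len(parts) == 3 else None

print(dataset_tier('/MyPrimary/username-myprocessing-v1/USER'))  # 'USER'   -> allowed with phys0x
print(dataset_tier('/MyPrimary/MyProcessed/AODSIM'))             # 'AODSIM' -> must be read from global DBS
print(dataset_tier('/MyPrimary'))                                # None (malformed name)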
def run(self, filecacheurl=None): """ Override run() for JobType """ configArguments = { 'addoutputfiles': [], 'adduserfiles': [], 'tfileoutfiles': [], 'edmoutfiles': [], } if getattr(self.config.Data, 'useParent', False) and getattr( self.config.Data, 'secondaryInputDataset', None): msg = "Invalid CRAB configuration: Parameters Data.useParent and Data.secondaryInputDataset cannot be used together." raise ConfigurationException(msg) # Get SCRAM environment scram = ScramEnvironment(logger=self.logger) configArguments.update({ 'jobarch': scram.getScramArch(), 'jobsw': scram.getCmsswVersion() }) # Build tarball if self.workdir: tarUUID = PandaInterface.wrappedUuidGen() self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID) if len(tarUUID): tarFilename = os.path.join(self.workdir, tarUUID + 'default.tgz') debugTarFilename = os.path.join(self.workdir, 'debugFiles.tgz') cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE) else: raise EnvironmentException( 'Problem with uuidgen while preparing for Sandbox upload.') else: _, tarFilename = tempfile.mkstemp(suffix='.tgz') _, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py') if getattr(self.config.Data, 'inputDataset', None): configArguments['inputdata'] = self.config.Data.inputDataset ## Create CMSSW config. self.logger.debug("self.config: %s" % (self.config)) self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName)) ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent ## in the sense that a second loading of the same pset may not produce the same ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW ## pset twice. However, some "complicated" psets seem to evade the caching. ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that ## it can be reused later if wanted (for example, in PrivateMC when checking if ## the pset has an LHE source) instead of having to load the pset again. ## As for what does "complicated" psets mean, Daniel Riley said that there are ## some psets where one module modifies the configuration from another module. self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger, userConfig=self.config.JobType.psetName) ## If there is a CMSSW pset, do a basic validation of it. if not bootstrapDone() and self.config.JobType.psetName: valid, msg = self.cmsswCfg.validateConfig() if not valid: raise ConfigurationException(msg) ## We need to put the pickled CMSSW configuration in the right place. ## Here, we determine if the bootstrap script already run and prepared everything ## for us. In such case we move the file, otherwise we pickle.dump the pset if not bootstrapDone(): # Write out CMSSW config self.cmsswCfg.writeFile(cfgOutputName) else: # Move the pickled and the configuration files created by the bootstrap script self.moveCfgFile(cfgOutputName) ## Interrogate the CMSSW pset for output files (only output files produced by ## PoolOutputModule or TFileService are identified automatically). Do this ## automatic detection even if JobType.disableAutomaticOutputCollection = True, ## so that we can still classify the output files in EDM, TFile and additional ## output files in the Task DB (and the job ad). ## TODO: Do we really need this classification at all? cmscp and PostJob read ## the FJR to know if an output file is EDM, TFile or other. 
edmfiles, tfiles = self.cmsswCfg.outputFiles() ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile ## output files that are not listed in JobType.outputFiles. if getattr( self.config.JobType, 'disableAutomaticOutputCollection', getParamDefaultValue( 'JobType.disableAutomaticOutputCollection')): outputFiles = [ re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', []) ] edmfiles = [file for file in edmfiles if file in outputFiles] tfiles = [file for file in tfiles if file in outputFiles] ## Get the list of additional output files that have to be collected as given ## in JobType.outputFiles, but remove duplicates listed already as EDM files or ## TFiles. addoutputFiles = [ re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', []) if re.sub(r'^file:', '', file) not in edmfiles + tfiles ] self.logger.debug( "The following EDM output files will be collected: %s" % edmfiles) self.logger.debug( "The following TFile output files will be collected: %s" % tfiles) self.logger.debug( "The following user output files will be collected: %s" % addoutputFiles) configArguments['edmoutfiles'] = edmfiles configArguments['tfileoutfiles'] = tfiles configArguments['addoutputfiles'].extend(addoutputFiles) ## Give warning message in case no output file was detected in the CMSSW pset ## nor was any specified in the CRAB configuration. if not configArguments['edmoutfiles'] and not configArguments[ 'tfileoutfiles'] and not configArguments['addoutputfiles']: msg = "%sWarning%s:" % (colors.RED, colors.NORMAL) if getattr( self.config.JobType, 'disableAutomaticOutputCollection', getParamDefaultValue( 'JobType.disableAutomaticOutputCollection')): msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration" msg += " and no output file was explicitly specified in the CRAB configuration." else: msg += " CRAB could not detect any output file in the CMSSW configuration" msg += " nor was any explicitly specified in the CRAB configuration." msg += " Hence CRAB will not collect any output file from this task." self.logger.warning(msg) ## UserTarball calls ScramEnvironment which can raise EnvironmentException. ## Since ScramEnvironment is already called above and the exception is not ## handled, we are sure that if we reached this point it will not raise EnvironmentException. ## But otherwise we should take this into account. with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb: inputFiles = [ re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'inputFiles', []) ] tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName) configArguments['adduserfiles'] = [ os.path.basename(f) for f in inputFiles ] try: # convert from unicode to ascii to make it work with older pycurl versions uploadResult = tb.upload( filecacheurl=filecacheurl.encode('ascii', 'ignore')) except HTTPException as hte: if 'X-Error-Info' in hte.headers: reason = hte.headers['X-Error-Info'] reason_re = re.compile( r'\AFile size is ([0-9]*)B\. This is bigger than the maximum allowed size of ([0-9]*)B\.$' ) re_match = reason_re.match(reason) if re_match: ISBSize = int(re_match.group(1)) ISBSizeLimit = int(re_match.group(2)) reason = "%sError%s:" % (colors.RED, colors.NORMAL) reason += " Input sanbox size is ~%sMB. This is bigger than the maximum allowed size of %sMB." 
% ( ISBSize / 1024 / 1024, ISBSizeLimit / 1024 / 1024) ISBContent = sorted(tb.content, reverse=True) biggestFileSize = ISBContent[0][0] ndigits = int( math.ceil(math.log(biggestFileSize + 1, 10))) reason += "\nInput sanbox content sorted by size[Bytes]:" for (size, name) in ISBContent: reason += ("\n%" + str(ndigits) + "s\t%s") % (size, name) raise ClientException(reason) raise hte except Exception as e: msg = ( "Impossible to calculate the checksum of the sandbox tarball.\nError message: %s.\n" "More details can be found in %s" % (e, self.logger.logfile)) LOGGERS['CRAB3'].exception( msg) #the traceback is only printed into the logfile raise ClientException(msg) debugFilesUploadResult = None with UserTarball(name=debugTarFilename, logger=self.logger, config=self.config) as dtb: dtb.addMonFiles() try: # convert from unicode to ascii to make it work with older pycurl versions debugFilesUploadResult = dtb.upload( filecacheurl=filecacheurl.encode('ascii', 'ignore')) except Exception as e: msg = ( "Problem uploading debug_files.tar.gz.\nError message: %s.\n" "More details can be found in %s" % (e, self.logger.logfile)) LOGGERS['CRAB3'].exception( msg) #the traceback is only printed into the logfile configArguments['cacheurl'] = filecacheurl configArguments['cachefilename'] = "%s.tar.gz" % uploadResult if debugFilesUploadResult is not None: configArguments[ 'debugfilename'] = "%s.tar.gz" % debugFilesUploadResult self.logger.debug("Result uploading input files: %(cachefilename)s " % configArguments) # Upload list of user-defined input files to process as the primary input userFilesList = getattr(self.config.Data, 'userInputFiles', None) if userFilesList: self.logger.debug( "Attaching list of user-specified primary input files.") userFilesList = map(string.strip, userFilesList) userFilesList = [file for file in userFilesList if file] if len(userFilesList) != len(set(userFilesList)): msg = "%sWarning%s:" % (colors.RED, colors.NORMAL) msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries." msg += " Duplicated entries will be removed." self.logger.warning(msg) configArguments['userfiles'] = set(userFilesList) configArguments['primarydataset'] = getattr( self.config.Data, 'outputPrimaryDataset', 'CRAB_UserFiles') lumi_mask_name = getattr(self.config.Data, 'lumiMask', None) lumi_list = None if lumi_mask_name: self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name)) try: lumi_list = getLumiList(lumi_mask_name, logger=self.logger) except ValueError as ex: msg = "%sError%s:" % (colors.RED, colors.NORMAL) msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex) raise ConfigurationException(msg) run_ranges = getattr(self.config.Data, 'runRange', None) if run_ranges: run_ranges_is_valid = re.match('^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges) if run_ranges_is_valid: run_list = getRunList(run_ranges) if lumi_list: lumi_list.selectRuns(run_list) if not lumi_list: msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null." raise ConfigurationException(msg) else: if len(run_list) > 50000: msg = "CRAB configuration parameter Data.runRange includes %s runs." % str( len(run_list)) msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs." raise ConfigurationException(msg) lumi_list = LumiList(runs=run_list) else: msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. 
Example: '12345,99900-99910'" raise ConfigurationException(msg) if lumi_list: configArguments['runs'] = lumi_list.getRuns() ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5' lumi_mask = lumi_list.getCompactList() configArguments['lumis'] = [ str(reduce(lambda x, y: x + y, lumi_mask[run]))[1:-1].replace(' ', '') for run in configArguments['runs'] ] configArguments['jobtype'] = 'Analysis' return tarFilename, configArguments
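## Standalone sketch of the sandbox-size error handling in the upload code above: the server's X-Error-Info header is matched against a fixed sentence and the two byte counts are converted to MB for the user message. The header text below is a made-up example in the format the regular expression expects.
import re

reason = "File size is 157286400B. This is bigger than the maximum allowed size of 104857600B."
size_re = re.compile(r'\AFile size is ([0-9]*)B\. This is bigger than the maximum allowed size of ([0-9]*)B\.$')
match = size_re.match(reason)
if match:
    isb_size_mb = int(match.group(1)) // 1024 // 1024
    isb_limit_mb = int(match.group(2)) // 1024 // 1024
    print("Input sandbox is ~%sMB; the maximum allowed size is %sMB." % (isb_size_mb, isb_limit_mb))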
def validateConfig(self, config): """ Validate the CMSSW portion of the config file making sure required values are there and optional values don't conflict. """ valid, reason = self.validateBasicConfig(config) if not valid: return valid, reason ## Make sure only one of the two parameters Data.inputDataset and Data.userInputFiles ## was specified. if getattr(config.Data, 'inputDataset', None) and getattr( config.Data, 'userInputFiles', None): msg = "Invalid CRAB configuration: Analysis job type accepts either an input dataset or a set of user input files to run on, but not both." msg += "\nSuggestion: Specify only one of the two parameters, Data.inputDataset or Data.userInputFiles, but not both." return False, msg ## Make sure at least one of the two parameters Data.inputDataset and Data.userInputFiles ## was specified. if not getattr(config.Data, 'inputDataset', None) and not getattr( config.Data, 'userInputFiles', None): msg = "Invalid CRAB configuration: Analysis job type requires an input dataset or a set of user input files to run on." msg += "\nSuggestion: To specify an input dataset use the parameter Data.inputDataset." msg += " To specify a set of user input files use the parameter Data.userInputFiles." return False, msg ## When running over an input dataset, we don't accept that the user specifies a ## primary dataset, because the primary dataset will already be extracted from ## the input dataset. if getattr(config.Data, 'inputDataset', None) and getattr( config.Data, 'outputPrimaryDataset', None): msg = "Invalid CRAB configuration: Analysis job type with input dataset does not accept an output primary dataset name to be specified," msg += " because the later will be extracted from the first." msg += "\nSuggestion: Remove the parameter Data.outputPrimaryDataset." return False, msg ## When running over user input files with publication turned on, we want the ## user to specify the primary dataset to be used for publication. if getattr(config.Data, 'publication', getParamDefaultValue('Data.publication')): if not getattr(config.Data, 'inputDataset', None): if not getattr(config.Data, 'outputPrimaryDataset', None): msg = "Invalid CRAB configuration: Parameter Data.outputPrimaryDataset not specified." msg += "\nAnalysis job type without input dataset requires this parameter for publication." return False, msg ## When running over user input files, make sure the splitting mode is 'FileBased'. if getattr(config.Data, 'userInputFiles', None) and self.splitAlgo != 'FileBased': msg = "Invalid CRAB configuration: Analysis job type with user input files only supports file-based splitting." msg += "\nSuggestion: Set Data.splitting = 'FileBased'." return False, msg ## Make sure the splitting algorithm is valid. allowedSplitAlgos = ['FileBased', 'LumiBased', 'EventAwareLumiBased'] self.checkAutomaticAvail(allowedSplitAlgos) if self.splitAlgo not in allowedSplitAlgos: msg = "Invalid CRAB configuration: Parameter Data.splitting has an invalid value ('%s')." % ( self.splitAlgo) msg += "\nAnalysis job type only supports the following splitting algorithms: %s." % ( allowedSplitAlgos) return False, msg return True, "Valid configuration"
def validateConfig(self): """ __validateConfig__ Checking if needed input parameters are there. Not all the commands require a configuration. """ # Check that the configuration object has the sections we expect it to have. # (WMCore already checks that attributes added to the configuration object are of type ConfigSection.) # Even if not all configuration sections need to be there, we anyway request # the user to add all the sections in the configuration file. if not hasattr(self.configuration, 'General'): msg = "Invalid CRAB configuration: Section 'General' is missing." return False, msg if not hasattr(self.configuration, 'JobType'): msg = "Invalid CRAB configuration: Section 'JobType' is missing." return False, msg if not hasattr(self.configuration, 'Data'): msg = "Invalid CRAB configuration: Section 'Data' is missing." return False, msg if not hasattr(self.configuration, 'Site'): msg = "Invalid CRAB configuration: Section 'Site' is missing." return False, msg return True, "Valid configuration" ## Some parameters may have been renamed. Check here if the configuration file has an old ## parameter defined, and in that case tell the user what is the new parameter name. for old_param, new_param in renamedParams.items(): if len(old_param.split('.')) != 2 or len( new_param['newParam'].split('.')) != 2: continue old_param_section, old_param_name = old_param.split('.') if hasattr(self.configuration, old_param_section) and hasattr( getattr(self.configuration, old_param_section), old_param_name): msg = "Invalid CRAB configuration: Parameter %s has been renamed to %s" % ( old_param, new_param['newParam']) if new_param['version'] != None: msg += " starting from CRAB %s" % (new_param['version']) msg += "; please change your configuration file accordingly." return False, msg # Check if there are unknown parameters (and try to suggest the correct parameter name). all_config_params = configParametersInfo.keys() SpellChecker.DICTIONARY = SpellChecker.train(all_config_params) for section in self.configuration.listSections_(): for attr in getattr(self.configuration, section).listSections_(): param = (section + '.' + attr) if not SpellChecker.is_correct(param): msg = "Invalid CRAB configuration: Parameter %s is not known." % param if SpellChecker.correct(param) != param: msg += " Maybe you mean %s?" % ( SpellChecker.correct(param)) return False, msg ## Check that each parameter specified in the configuration file is of the ## type specified in the configuration map. ## Check that, if a parameter is a required one and it has no default value, ## then it must be specified in the configuration file. for paramName, paramInfo in configParametersInfo.items(): requiredTypeName = paramInfo['type'] try: requiredType = getattr(types, requiredTypeName) except AttributeError: msg = "Invalid type %s specified in CRABClient configuration mapping for parameter %s." % ( requiredTypeName, paramName) return False, msg attrs = paramName.split('.') obj = self.configuration while attrs and obj is not None: obj = getattr(obj, attrs.pop(0), None) if obj is not None: if not isinstance(obj, requiredType): msg = "Invalid CRAB configuration: Parameter %s requires a value of type %s (while a value of type %s was given)." \ % (paramName, str(requiredType), str(type(obj))) if paramName == "Data.totalUnits" and isinstance( obj, float): continue if paramName == "Data.userInputFiles": msg += "\nIn CRAB v3.3.14 the configuration parameter Data.userInputFiles has been modified to directly take a (python) list of primary input files." 
msg += " Previously it was taking the name of a local text file where the primary input files were listed." msg += " One can still use a text file and convert its content into a python list by doing Data.userInputFiles = list(open('my_list_of_files.txt'))" return False, msg elif requiredType == list: if not all(isinstance(arg, str) for arg in obj): msg = "Invalid CRAB configuration: Parameter %s has to be a list of strings." % ( paramName) return False, msg elif getParamDefaultValue( paramName) is None and paramInfo['required']: msg = "Invalid CRAB configuration: Parameter %s is missing." % ( paramName) return False, msg return True, "Valid configuration"
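## Standalone sketch of the workaround mentioned in the message above: building the Data.userInputFiles python list from a plain text file with one input file per line. The file name is a made-up placeholder; trailing newlines are stripped here, although the client also strips the entries later on.
with open('my_list_of_files.txt') as listfile:  # placeholder file name
    user_input_files = [line.strip() for line in listfile if line.strip()]
# config.Data.userInputFiles = user_input_files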
def validateConfig(self): """ __validateConfig__ Checking if needed input parameters are there """ valid, msg = SubCommand.validateConfig(self) if not valid: return False, msg requestNameLenLimit = 100 if hasattr(self.configuration.General, 'requestName'): if len(self.configuration.General.requestName ) > requestNameLenLimit: msg = "Invalid CRAB configuration: Parameter General.requestName should not be longer than %d characters." % ( requestNameLenLimit) return False, msg splitting = getattr(self.configuration.Data, 'splitting', 'Automatic') autoSplitt = True if splitting == 'Automatic' else False autoSplittUnitsMin = 180 # 3 hours (defined also in TW config as 'minAutomaticRuntimeMins') autoSplittUnitsMax = 2700 # 45 hours ## Check that maxJobRuntimeMin is not used with Automatic splitting if autoSplitt and hasattr(self.configuration.JobType, 'maxJobRuntimeMin'): msg = "The 'maxJobRuntimeMin' parameter is not compatible with the 'Automatic' splitting mode (default)." return False, msg ## Check that --dryrun is not used with Automatic splitting if autoSplitt and self.options.dryrun: msg = "The 'dryrun' option is not compatible with the 'Automatic' splitting mode (default)." return False, msg ## Check that Data.unitsPerjob is specified. if hasattr(self.configuration.Data, 'unitsPerJob'): try: float(self.configuration.Data.unitsPerJob) except ValueError: msg = "Invalid CRAB configuration: Parameter Data.unitsPerJob must be a valid number, not %s." % ( self.configuration.Data.unitsPerJob) return False, msg if not int(self.configuration.Data.unitsPerJob) > 0: msg = "Invalid CRAB configuration: Parameter Data.unitsPerJob must be > 0, not %s." % ( self.configuration.Data.unitsPerJob) return False, msg if autoSplitt and ( self.configuration.Data.unitsPerJob > autoSplittUnitsMax or self.configuration.Data.unitsPerJob < autoSplittUnitsMin): msg = "Invalid CRAB configuration: In case of Automatic splitting, the Data.unitsPerJob parameter must be in the [%d, %d] minutes range. You asked for %d minutes." % ( autoSplittUnitsMin, autoSplittUnitsMax, self.configuration.Data.unitsPerJob) return False, msg elif not autoSplitt: # The default value is only valid for automatic splitting! msg = "Invalid CRAB configuration: Parameter Data.unitsPerJob is mandatory for '%s' splitting mode." % splitting return False, msg ## Check that JobType.pluginName and JobType.externalPluginFile are not both specified. if hasattr(self.configuration.JobType, 'pluginName') and hasattr( self.configuration.JobType, 'externalPluginFile'): msg = "Invalid CRAB configuration: Only one of JobType.pluginName or JobType.externalPluginFile parameters can be specified." pluginName_default = getParamDefaultValue('JobType.pluginName') if pluginName_default: msg += "\nIf neither JobType.pluginName nor JobType.externalPluginFile would be specified," msg += " the default JobType.pluginName = '%s' would be used." % ( pluginName_default) return False, msg ## Load the external plugin or check that the crab plugin is valid. external_plugin_name = getattr(self.configuration.JobType, 'externalPluginFile', None) crab_plugin_name = getattr(self.configuration.JobType, 'pluginName', None) crab_job_types = { 'ANALYSIS': None, 'PRIVATEMC': None, 'COPYCAT': None } #getJobTypes() if external_plugin_name: addPlugin(external_plugin_name) # Do we need to do this here? if crab_plugin_name: if upper(crab_plugin_name) not in crab_job_types: msg = "Invalid CRAB configuration: Parameter JobType.pluginName has an invalid value ('%s')." 
% ( crab_plugin_name) msg += "\nAllowed values are: %s." % (", ".join( ['%s' % job_type for job_type in crab_job_types.keys()])) return False, msg msg = "Will use CRAB %s plugin" % ("Analysis" if upper( crab_plugin_name) == 'ANALYSIS' else "PrivateMC") msg += " (i.e. will run %s job type)." % ("an analysis" if upper( crab_plugin_name) == 'ANALYSIS' else "a MC generation") self.logger.debug(msg) ## Check that the particular combination (Data.publication = True, General.transferOutputs = False) is not specified. if getattr(self.configuration.Data, 'publication', getParamDefaultValue('Data.publication')) and \ not getattr(self.configuration.General, 'transferOutputs', getParamDefaultValue('General.transferOutputs')): msg = "Invalid CRAB configuration: Data.publication is True, but General.transferOutputs is False." msg += "\nPublication can not be performed if the output files are not transferred to a permanent storage." return False, msg ## Check that a storage site is specified if General.transferOutputs = True or General.transferLogs = True. if not hasattr(self.configuration.Site, 'storageSite'): if getattr(self.configuration.General, 'transferLogs', getParamDefaultValue('General.transferLogs')) or \ getattr(self.configuration.General, 'transferOutputs', getParamDefaultValue('General.transferOutputs')): msg = "Invalid CRAB configuration: Parameter Site.storageSite is missing." return False, msg ## If an input dataset and a DBS URL are specified, check that the DBS URL is a good one. ## Also, if the DBS URL is 'phys0x', check that the input dataset tier is USER. if hasattr(self.configuration.Data, 'inputDBS'): if hasattr(self.configuration.Data, 'inputDataset'): msg = None dbs_urls_aliases = DBSURLS['reader'].keys() dbs_urls = DBSURLS['reader'].values() if (self.configuration.Data.inputDBS not in dbs_urls_aliases ) and (self.configuration.Data.inputDBS.rstrip('/') not in dbs_urls): msg = "Invalid CRAB configuration: Parameter Data.inputDBS has an invalid value ('%s')." % ( self.configuration.Data.inputDBS) msg += "\nAllowed values are: " msg += "\n ".join([ "'%s' ('%s')" % (alias, url) for alias, url in DBSURLS['reader'].iteritems() ]) local_dbs_urls_aliases = ['phys01', 'phys02', 'phys03'] local_dbs_urls = [ DBSURLS['reader'][alias] for alias in local_dbs_urls_aliases if alias in DBSURLS['reader'] ] if self.configuration.Data.inputDBS in local_dbs_urls + local_dbs_urls_aliases: inputDataset_parts = self.configuration.Data.inputDataset.split( '/') inputDataset_parts.pop(0) inputDataset_tier = inputDataset_parts[-1] if len( inputDataset_parts) == 3 else None user_data_tiers = ['USER'] if inputDataset_tier not in user_data_tiers: msg = "Invalid CRAB configuration: A local DBS instance '%s' was specified for reading an input dataset of tier %s." \ % (self.configuration.Data.inputDBS, inputDataset_tier) msg += "\nDatasets of tier different than %s must be read from the global DBS instance; this is, set Data.inputDBS = 'global'." \ % (", ".join(user_data_tiers[:-1]) + " or " + user_data_tiers[-1] if len(user_data_tiers) > 1 else user_data_tiers[0]) if msg: inputDBS_default = getParamDefaultValue('Data.inputDBS') if inputDBS_default: inputDBS_default, inputDBS_default_alias = self.getDBSURLAndAlias( inputDBS_default, 'reader') if inputDBS_default and inputDBS_default_alias: msg += "\nIf Data.inputDBS would not be specified, the default '%s' ('%s') would be used." 
% ( inputDBS_default_alias, inputDBS_default) return False, msg ## If a publication DBS URL is specified and publication is ON, check that the DBS URL is a good one. if hasattr(self.configuration.Data, 'publishDBS'): if getattr(self.configuration.Data, 'publication', getParamDefaultValue('Data.publication')): dbs_urls = DBSURLS['writer'].values() dbs_urls_aliases = DBSURLS['writer'].keys() if (self.configuration.Data.publishDBS not in dbs_urls_aliases ) and (self.configuration.Data.publishDBS.rstrip('/') not in dbs_urls): msg = "Invalid CRAB configuration: Parameter Data.publishDBS has an invalid value ('%s')." % ( self.configuration.Data.publishDBS) msg += "\nAllowed values are: " msg += "\n ".join([ "'%s' ('%s')" % (alias, url) for alias, url in DBSURLS['writer'].iteritems() ]) publishDBS_default = getParamDefaultValue( 'Data.publishDBS') if publishDBS_default: publishDBS_default, publishDBS_default_alias = self.getDBSURLAndAlias( publishDBS_default, 'writer') if publishDBS_default and publishDBS_default_alias: msg += "\nIf Data.publishDBS would not be specified, the default '%s' ('%s') would be used." \ % (publishDBS_default_alias, publishDBS_default) return False, msg if hasattr(self.configuration.JobType, 'scriptExe'): if not os.path.isfile(self.configuration.JobType.scriptExe): msg = "Cannot find the file %s specified in the JobType.scriptExe configuration parameter." % ( self.configuration.JobType.scriptExe) return False, msg ## If Data.ignoreLocality is set, check that a site whitelist is present. if getattr(self.configuration.Data, 'ignoreLocality', False): if not hasattr(self.configuration.Site, 'whitelist'): msg = "Invalid CRAB configuration:\n When Data.ignoreLocality is set, a valid site whitelist must be specified using the Site.whitelist parameter." return False, msg if hasattr(self.configuration.General, 'failureLimit'): msg = "You have specified the deprecated parameter 'failureLimit', which will be removed in the near future." msg += "\nIf you really need it, write a mail to hn-cms-computingTools explaining your use case." self.logger.warning("%sWARNING%s: %s" % (colors.RED, colors.NORMAL, msg)) return True, "Valid configuration"
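## Illustrative sketch (not part of CRABClient): the Data.unitsPerJob constraint enforced in
## the submit validation above, isolated into a standalone function. The 180/2700 minute
## bounds are the autoSplittUnitsMin/autoSplittUnitsMax values from the code above; the flat
## (splitting, units_per_job) interface is a hypothetical simplification.
def check_units_per_job(splitting, units_per_job):
    """Return (valid, reason), mirroring the splitting/unitsPerJob checks above."""
    auto_min, auto_max = 180, 2700  # minutes, as in autoSplittUnitsMin / autoSplittUnitsMax
    if units_per_job is None:
        if splitting == 'Automatic':
            return True, "Automatic splitting uses a default target runtime."
        return False, "Data.unitsPerJob is mandatory for '%s' splitting mode." % splitting
    try:
        units = float(units_per_job)
    except ValueError:
        return False, "Data.unitsPerJob must be a valid number, not %s." % units_per_job
    if not int(units) > 0:
        return False, "Data.unitsPerJob must be > 0, not %s." % units_per_job
    if splitting == 'Automatic' and not (auto_min <= units <= auto_max):
        return False, "With Automatic splitting, Data.unitsPerJob must be in the [%d, %d] minutes range." % (auto_min, auto_max)
    return True, "Valid configuration"

if __name__ == '__main__':
    print(check_units_per_job('Automatic', 600))   # valid: 10 hours of target runtime per job
    print(check_units_per_job('Automatic', 60))    # invalid: below the 180 minute minimum
    print(check_units_per_job('FileBased', None))  # invalid: unitsPerJob is mandatory here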
def validateConfig(self): """ __validateConfig__ Checking if needed input parameters are there """ valid, msg = SubCommand.validateConfig(self) if not valid: return False, msg ## Check that Data.unitsPerjob is specified. if hasattr(self.configuration.Data, 'unitsPerJob'): try: float(self.configuration.Data.unitsPerJob) except ValueError: msg = "Invalid CRAB configuration: Parameter Data.unitsPerJob must be a valid number, not %s." % ( self.configuration.Data.unitsPerJob) return False, msg ## Check that JobType.pluginName and JobType.externalPluginFile are not both specified. if hasattr(self.configuration.JobType, 'pluginName') and hasattr( self.configuration.JobType, 'externalPluginFile'): msg = "Invalid CRAB configuration: Only one of JobType.pluginName or JobType.externalPluginFile parameters can be specified." pluginName_default = getParamDefaultValue('JobType.pluginName') if pluginName_default: msg += "\nIf neither JobType.pluginName nor JobType.externalPluginFile would be specified," msg += " the default JobType.pluginName = '%s' would be used." % ( pluginName_default) return False, msg ## Load the external plugin or check that the crab plugin is valid. external_plugin_name = getattr(self.configuration.JobType, 'externalPluginFile', None) crab_plugin_name = getattr(self.configuration.JobType, 'pluginName', None) crab_job_types = {'ANALYSIS': None, 'PRIVATEMC': None} #getJobTypes() if external_plugin_name: addPlugin(external_plugin_name) # Do we need to do this here? if crab_plugin_name: if upper(crab_plugin_name) not in crab_job_types: msg = "Invalid CRAB configuration: Parameter JobType.pluginName has an invalid value ('%s')." % ( crab_plugin_name) msg += "\nAllowed values are: %s." % (", ".join( ['%s' % job_type for job_type in crab_job_types.keys()])) return False, msg msg = "Will use CRAB %s plugin" % ("Analysis" if upper( crab_plugin_name) == 'ANALYSIS' else "PrivateMC") msg += " (i.e. will run %s job type)." % ("an analysis" if upper( crab_plugin_name) == 'ANALYSIS' else "a MC generation") self.logger.debug(msg) ## Check that the particular combination (Data.publication = True, General.transferOutputs = False) is not specified. if getattr(self.configuration.Data, 'publication', getParamDefaultValue('Data.publication')) and \ not getattr(self.configuration.General, 'transferOutputs', getParamDefaultValue('General.transferOutputs')): msg = "Invalid CRAB configuration: Data.publication is True, but General.transferOutputs is False." msg += "\nPublication can not be performed if the output files are not transferred to a permanent storage." return False, msg ## Check that a storage site is specified if General.transferOutputs = True or General.transferLogs = True. if not hasattr(self.configuration.Site, 'storageSite'): if getattr(self.configuration.General, 'transferLogs', getParamDefaultValue('General.transferLogs')) or \ getattr(self.configuration.General, 'transferOutputs', getParamDefaultValue('General.transferOutputs')): msg = "Invalid CRAB configuration: Parameter Site.storageSite is missing." return False, msg ## If an input dataset and a DBS URL are specified, check that the DBS URL is a good one. ## Also, if the DBS URL is 'phys0x', check that the input dataset tier is USER. 
if hasattr(self.configuration.Data, 'inputDBS'): if hasattr(self.configuration.Data, 'inputDataset'): msg = None dbs_urls_aliases = DBSURLS['reader'].keys() dbs_urls = DBSURLS['reader'].values() if (self.configuration.Data.inputDBS not in dbs_urls_aliases ) and (self.configuration.Data.inputDBS.rstrip('/') not in dbs_urls): msg = "Invalid CRAB configuration: Parameter Data.inputDBS has an invalid value ('%s')." % ( self.configuration.Data.inputDBS) msg += "\nAllowed values are: " msg += "\n ".join([ "'%s' ('%s')" % (alias, url) for alias, url in DBSURLS['reader'].iteritems() ]) local_dbs_urls_aliases = ['phys01', 'phys02', 'phys03'] local_dbs_urls = [ DBSURLS['reader'][alias] for alias in local_dbs_urls_aliases if alias in DBSURLS['reader'] ] if self.configuration.Data.inputDBS in local_dbs_urls + local_dbs_urls_aliases: inputDataset_parts = self.configuration.Data.inputDataset.split( '/') inputDataset_parts.pop(0) inputDataset_tier = inputDataset_parts[-1] if len( inputDataset_parts) == 3 else None user_data_tiers = ['USER'] if inputDataset_tier not in user_data_tiers: msg = "Invalid CRAB configuration: A local DBS instance '%s' was specified for reading an input dataset of tier %s." \ % (self.configuration.Data.inputDBS, inputDataset_tier) msg += "\nDatasets of tier different than %s must be read from the global DBS instance; this is, set Data.inputDBS = 'global'." \ % (", ".join(user_data_tiers[:-1]) + " or " + user_data_tiers[-1] if len(user_data_tiers) > 1 else user_data_tiers[0]) if msg: inputDBS_default = getParamDefaultValue('Data.inputDBS') if inputDBS_default: inputDBS_default, inputDBS_default_alias = self.getDBSURLAndAlias( inputDBS_default, 'reader') if inputDBS_default and inputDBS_default_alias: msg += "\nIf Data.inputDBS would not be specified, the default '%s' ('%s') would be used." % ( inputDBS_default_alias, inputDBS_default) return False, msg ## If a publication DBS URL is specified and publication is ON, check that the DBS URL is a good one. if hasattr(self.configuration.Data, 'publishDBS'): if getattr(self.configuration.Data, 'publication', getParamDefaultValue('Data.publication')): dbs_urls = DBSURLS['writer'].values() dbs_urls_aliases = DBSURLS['writer'].keys() if (self.configuration.Data.publishDBS not in dbs_urls_aliases ) and (self.configuration.Data.publishDBS.rstrip('/') not in dbs_urls): msg = "Invalid CRAB configuration: Parameter Data.publishDBS has an invalid value ('%s')." % ( self.configuration.Data.publishDBS) msg += "\nAllowed values are: " msg += "\n ".join([ "'%s' ('%s')" % (alias, url) for alias, url in DBSURLS['writer'].iteritems() ]) publishDBS_default = getParamDefaultValue( 'Data.publishDBS') if publishDBS_default: publishDBS_default, publishDBS_default_alias = self.getDBSURLAndAlias( publishDBS_default, 'writer') if publishDBS_default and publishDBS_default_alias: msg += "\nIf Data.publishDBS would not be specified, the default '%s' ('%s') would be used." \ % (publishDBS_default_alias, publishDBS_default) return False, msg if hasattr(self.configuration.JobType, 'scriptExe'): if not os.path.isfile(self.configuration.JobType.scriptExe): msg = "Cannot find the file %s specified in the JobType.scriptExe configuration parameter." % ( self.configuration.JobType.scriptExe) return False, msg return True, "Valid configuration"
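## Illustrative sketch (not part of CRABClient): how the input dataset tier is derived from a
## '/primary/processed/TIER' dataset name and compared against the DBS instance, as in the
## phys0x check above. The small alias list stands in for the DBSURLS dictionary; the
## function names and example datasets are made up for demonstration.
def dataset_tier(input_dataset):
    """Return the data tier of a '/primary/processed/TIER' dataset name, or None."""
    parts = input_dataset.split('/')
    parts.pop(0)  # drop the empty string coming from the leading '/'
    return parts[-1] if len(parts) == 3 else None

def check_input_dbs(input_dbs, input_dataset):
    """Return (valid, reason): local DBS instances may only serve USER-tier datasets."""
    local_dbs_aliases = ['phys01', 'phys02', 'phys03']
    tier = dataset_tier(input_dataset)
    if input_dbs in local_dbs_aliases and tier != 'USER':
        return False, ("A local DBS instance '%s' was specified for an input dataset of tier %s;"
                       " datasets of tier different than USER must be read from 'global'." % (input_dbs, tier))
    return True, "Valid configuration"

if __name__ == '__main__':
    print(check_input_dbs('phys03', '/MyPrimary/myuser-mysample-hash/USER'))        # valid
    print(check_input_dbs('phys03', '/SingleMuon/Run2016B-PromptReco-v2/MINIAOD'))  # invalid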
def run(self, filecacheurl=None): """ Override run() for JobType """ configArguments = {"addoutputfiles": [], "adduserfiles": [], "tfileoutfiles": [], "edmoutfiles": []} # Get SCRAM environment scram = ScramEnvironment(logger=self.logger) configArguments.update({"jobarch": scram.getScramArch(), "jobsw": scram.getCmsswVersion()}) # Build tarball if self.workdir: tarUUID = PandaInterface.wrappedUuidGen() self.logger.debug("UNIQUE NAME: tarUUID %s " % tarUUID) if len(tarUUID): tarFilename = os.path.join(self.workdir, tarUUID + "default.tgz") cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE) else: raise EnvironmentException("Problem with uuidgen while preparing for Sandbox upload.") else: _dummy, tarFilename = tempfile.mkstemp(suffix=".tgz") _dummy, cfgOutputName = tempfile.mkstemp(suffix="_cfg.py") if getattr(self.config.Data, "inputDataset", None): configArguments["inputdata"] = self.config.Data.inputDataset ## Create CMSSW config. self.logger.debug("self.config: %s" % (self.config)) self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName)) ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent ## in the sense that a second loading of the same pset may not produce the same ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW ## pset twice. However, some "complicated" psets seem to evade the caching. ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that ## it can be reused later if wanted (for example, in PrivateMC when checking if ## the pset has an LHE source) instead of having to load the pset again. ## As for what does "complicated" psets mean, Daniel Riley said that there are ## some psets where one module modifies the configuration from another module. self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger, userConfig=self.config.JobType.psetName) ## If there is a CMSSW pset, do a basic validation of it. if not bootstrapDone() and self.config.JobType.psetName: valid, msg = self.cmsswCfg.validateConfig() if not valid: raise ConfigurationException(msg) ## We need to put the pickled CMSSW configuration in the right place. ## Here, we determine if the bootstrap script already run and prepared everything ## for us. In such case we move the file, otherwise we pickle.dump the pset if not bootstrapDone(): # Write out CMSSW config self.cmsswCfg.writeFile(cfgOutputName) else: # Move the pickled configuration file created by the bootstrap script self.moveCfgFile(cfgOutputName) ## Interrogate the CMSSW pset for output files (only output files produced by ## PoolOutputModule or TFileService are identified automatically). Do this ## automatic detection even if JobType.disableAutomaticOutputCollection = True, ## so that we can still classify the output files in EDM, TFile and additional ## output files in the Task DB (and the job ad). ## TODO: Do we really need this classification at all? cmscp and PostJob read ## the FJR to know if an output file is EDM, TFile or other. edmfiles, tfiles = self.cmsswCfg.outputFiles() ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile ## output files that are not listed in JobType.outputFiles. 
if getattr( self.config.JobType, "disableAutomaticOutputCollection", getParamDefaultValue("JobType.disableAutomaticOutputCollection"), ): outputFiles = [re.sub(r"^file:", "", file) for file in getattr(self.config.JobType, "outputFiles", [])] edmfiles = [file for file in edmfiles if file in outputFiles] tfiles = [file for file in tfiles if file in outputFiles] ## Get the list of additional output files that have to be collected as given ## in JobType.outputFiles, but remove duplicates listed already as EDM files or ## TFiles. addoutputFiles = [ re.sub(r"^file:", "", file) for file in getattr(self.config.JobType, "outputFiles", []) if re.sub(r"^file:", "", file) not in edmfiles + tfiles ] self.logger.debug("The following EDM output files will be collected: %s" % edmfiles) self.logger.debug("The following TFile output files will be collected: %s" % tfiles) self.logger.debug("The following user output files will be collected: %s" % addoutputFiles) configArguments["edmoutfiles"] = edmfiles configArguments["tfileoutfiles"] = tfiles configArguments["addoutputfiles"].extend(addoutputFiles) ## Give warning message in case no output file was detected in the CMSSW pset ## nor was any specified in the CRAB configuration. if ( not configArguments["edmoutfiles"] and not configArguments["tfileoutfiles"] and not configArguments["addoutputfiles"] ): msg = "%sWarning%s:" % (colors.RED, colors.NORMAL) if getattr( self.config.JobType, "disableAutomaticOutputCollection", getParamDefaultValue("JobType.disableAutomaticOutputCollection"), ): msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration" msg += " and no output file was explicitly specified in the CRAB configuration." else: msg += " CRAB could not detect any output file in the CMSSW configuration" msg += " nor was any explicitly specified in the CRAB configuration." msg += " Hence CRAB will not collect any output file from this task." self.logger.warning(msg) ## UserTarball calls ScramEnvironment which can raise EnvironmentException. ## Since ScramEnvironment is already called above and the exception is not ## handled, we are sure that if we reached this point it will not raise EnvironmentException. ## But otherwise we should take this into account. with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb: inputFiles = [re.sub(r"^file:", "", file) for file in getattr(self.config.JobType, "inputFiles", [])] tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName) configArguments["adduserfiles"] = [os.path.basename(f) for f in inputFiles] uploadResults = tb.upload(filecacheurl=filecacheurl) self.logger.debug("Result uploading input files: %s " % str(uploadResults)) configArguments["cacheurl"] = filecacheurl configArguments["cachefilename"] = uploadResults[0] isbchecksum = uploadResults[1] # Upload list of user-defined input files to process as the primary input userFilesList = getattr(self.config.Data, "userInputFiles", None) if userFilesList: self.logger.debug("Attaching list of user-specified primary input files.") userFilesList = map(string.strip, userFilesList) userFilesList = [file for file in userFilesList if file] if len(userFilesList) != len(set(userFilesList)): msg = "%sWarning%s:" % (colors.RED, colors.NORMAL) msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries." msg += " Duplicated entries will be removed." self.logger.warning(msg) configArguments["userfiles"] = set(userFilesList) ## Get the user-specified primary dataset name. 
primaryDataset = getattr(self.config.Data, "primaryDataset", "CRAB_UserFiles") # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar" primaryDataset = "/" + os.path.join(*primaryDataset.split("/")) if not re.match("/%(primDS)s.*" % (lfnParts), primaryDataset): self.logger.warning("Invalid primary dataset name %s; publication may fail." % (primaryDataset)) configArguments["inputdata"] = primaryDataset lumi_mask_name = getattr(self.config.Data, "lumiMask", None) lumi_list = None if lumi_mask_name: self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name)) try: lumi_list = getLumiList(lumi_mask_name, logger=self.logger) except ValueError as ex: msg = "%sError%s:" % (colors.RED, colors.NORMAL) msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex) raise ConfigurationException(msg) run_ranges = getattr(self.config.Data, "runRange", None) if run_ranges: run_ranges_is_valid = re.match("^\d+((?!(-\d+-))(\,|\-)\d+)*$", run_ranges) if run_ranges_is_valid: run_list = getRunList(run_ranges) if lumi_list: lumi_list.selectRuns(run_list) if not lumi_list: msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null." raise ConfigurationException(msg) else: if len(run_list) > 50000: msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list)) msg += ( " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs." ) raise ConfigurationException(msg) lumi_list = LumiList(runs=run_list) else: msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. Example: '12345,99900-99910'" raise ConfigurationException(msg) if lumi_list: configArguments["runs"] = lumi_list.getRuns() ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5' lumi_mask = lumi_list.getCompactList() configArguments["lumis"] = [ str(reduce(lambda x, y: x + y, lumi_mask[run]))[1:-1].replace(" ", "") for run in configArguments["runs"] ] configArguments["jobtype"] = "Analysis" return tarFilename, configArguments, isbchecksum
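## Illustrative sketch (not part of CRABClient): the primary dataset normalisation and the
## Data.runRange syntax check used in run() above, reproduced as standalone helpers. The
## regular expression is the one from the code above; the example values are made up.
import os
import re

def normalize_primary_dataset(primary_dataset):
    """Normalize 'foo/bar' and '/foo/bar' to '/foo/bar', as done before the publication check."""
    return "/" + os.path.join(*primary_dataset.split("/"))

def run_range_is_valid(run_ranges):
    """True for comma separated integers and inclusive ranges, e.g. '12345,99900-99910'."""
    return re.match(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges) is not None

if __name__ == '__main__':
    print(normalize_primary_dataset('CRAB_UserFiles'))     # -> /CRAB_UserFiles
    print(normalize_primary_dataset('/MyPrimaryDataset'))  # -> /MyPrimaryDataset
    print(run_range_is_valid('12345,99900-99910'))         # True
    print(run_range_is_valid('12345-'))                    # False (dangling range)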
def validateConfig(self): """ __validateConfig__ Checking if needed input parameters are there """ valid, msg = SubCommand.validateConfig(self) if not valid: return False, msg ## Check that the configuration object has the sections we expect it to have. ## (WMCore already checks that attributes added to the configuration object are of type ConfigSection.) ## Even if not all configuration sections need to be there, we anyway request ## the user to add all the sections in the configuration file. if not hasattr(self.configuration, 'General'): msg = "CRAB configuration problem: Section 'General' is missing" return False, msg if not hasattr(self.configuration, 'JobType'): msg = "CRAB configuration problem: Section 'JobType' is missing" return False, msg if not hasattr(self.configuration, 'Data'): msg = "CRAB configuration problem: Section 'Data' is missing" return False, msg if not hasattr(self.configuration, 'Site'): msg = "CRAB configuration problem: Section 'Site' is missing" return False, msg ## Some parameters may have been renamed. Check here if the configuration file has an old ## parameter defined, and in that case tell the user what is the new parameter name. for old_param, new_param in renamed_params.iteritems(): if len(old_param.split('.')) != 2 or len( new_param.split('.')) != 2: continue old_param_section, old_param_name = old_param.split('.') if hasattr(self.configuration, old_param_section) and hasattr( getattr(self.configuration, old_param_section), old_param_name): msg = "CRAB configuration problem: Parameter %s has been renamed to %s; please change your configuration file accordingly" % ( old_param, new_param) return False, msg ## Check that Data.unitsPerjob is specified. if hasattr(self.configuration.Data, 'unitsPerJob'): try: float(self.configuration.Data.unitsPerJob) except ValueError: msg = "CRAB configuration problem: Parameter Data.unitsPerJob must be a valid number, not %s" % self.configuration.Data.unitsPerJob return False, msg ## Check that JobType.pluginName and JobType.externalPluginFile are not both specified. if hasattr(self.configuration.JobType, 'pluginName') and hasattr( self.configuration.JobType, 'externalPluginFile'): msg = "CRAB configuration problem: Only one of JobType.pluginName or JobType.externalPluginFile parameters can be specified" pluginName_default = getParamDefaultValue('JobType.pluginName') if pluginName_default: msg += "\nIf neither JobType.pluginName nor JobType.externalPluginFile would be specified, the default JobType.pluginName = '%s' would be used" \ % pluginName_default return False, msg ## Load the external plugin or check that the crab plugin is valid. external_plugin_name = getattr(self.configuration.JobType, 'externalPluginFile', None) crab_plugin_name = getattr(self.configuration.JobType, 'pluginName', None) crab_job_types = {'ANALYSIS': None, 'PRIVATEMC': None} #getJobTypes() if external_plugin_name: addPlugin(external_plugin_name) # Do we need to do this here? if crab_plugin_name and upper(crab_plugin_name) not in crab_job_types: msg = "CRAB configuration problem: Parameter JobType.pluginName has an invalid value '%s'" % crab_plugin_name msg += "\nAllowed values are: %s" % ", ".join( ['%s' % job_type for job_type in crab_job_types.keys()]) return False, msg ## Check that the particular combination (Data.publication = True, General.transferOutputs = False) is not specified. 
if hasattr(self.configuration.Data, 'publication') and hasattr( self.configuration.General, 'transferOutputs'): if self.configuration.Data.publication and not self.configuration.General.transferOutputs: msg = "CRAB configuration problem: Data.publication is on, but General.transferOutputs is off" msg += "\nPublication can not be performed if the output files are not transferred to a permanent storage" return False, msg ## Check that a storage site is specified if General.transferOutputs = True or General.transferLogs = True. if not hasattr(self.configuration.Site, 'storageSite'): if (hasattr(self.configuration.General, 'transferOutputs') and self.configuration.General.transferOutputs) or \ (hasattr(self.configuration.General, 'transferLogs') and self.configuration.General.transferLogs): msg = "CRAB configuration problem: Parameter Site.storageSite is missing" return False, msg ## If an input dataset and a DBS URL are specified, check that the DBS URL is a good one. ## Also, if the DBS URL is 'phys0x', check that the input dataset tier is USER. if hasattr(self.configuration.Data, 'inputDBS'): if hasattr(self.configuration.Data, 'inputDataset'): msg = None dbs_urls_aliases = DBSURLS['reader'].keys() dbs_urls = DBSURLS['reader'].values() if (self.configuration.Data.inputDBS not in dbs_urls_aliases ) and (self.configuration.Data.inputDBS.rstrip('/') not in dbs_urls): msg = "CRAB configuration problem: Parameter Data.inputDBS has an invalid value '%s'" % self.configuration.Data.inputDBS msg += "\nAllowed values are: " msg += "\n ".join([ "'%s' ('%s')" % (alias, url) for alias, url in DBSURLS['reader'].iteritems() ]) local_dbs_urls_aliases = ['phys01', 'phys02', 'phys03'] local_dbs_urls = [ DBSURLS['reader'][alias] for alias in local_dbs_urls_aliases if alias in DBSURLS['reader'] ] if self.configuration.Data.inputDBS in local_dbs_urls + local_dbs_urls_aliases: inputDataset_parts = self.configuration.Data.inputDataset.split( '/') inputDataset_parts.pop(0) inputDataset_tier = inputDataset_parts[-1] if len( inputDataset_parts) == 3 else None user_data_tiers = ['USER'] if inputDataset_tier not in user_data_tiers: msg = "CRAB configuration problem: A local DBS instance '%s' was specified for reading an input dataset of tier %s" \ % (self.configuration.Data.inputDBS, inputDataset_tier) msg += "\nDatasets of tier different than %s must be read from the global DBS instance; this is, set Data.inputDBS = 'global'" \ % (", ".join(user_data_tiers[:-1]) + " or " + user_data_tiers[-1] if len(user_data_tiers) > 1 else user_data_tiers[0]) if msg: inputDBS_default = getParamDefaultValue('Data.inputDBS') if inputDBS_default: inputDBS_default, inputDBS_default_alias = self.getDBSURLAndAlias( inputDBS_default, 'reader') if inputDBS_default and inputDBS_default_alias: msg += "\nIf Data.inputDBS would not be specified, the default '%s' ('%s') would be used" % ( inputDBS_default_alias, inputDBS_default) return False, msg ## If a publication DBS URL is specified and publication is ON, check that the DBS URL is a good one. 
if hasattr(self.configuration.Data, 'publishDBS'): publication_default = getParamDefaultValue('Data.publication') if getattr(self.configuration.Data, 'publication', publication_default): dbs_urls = DBSURLS['writer'].values() dbs_urls_aliases = DBSURLS['writer'].keys() if (self.configuration.Data.publishDBS not in dbs_urls_aliases ) and (self.configuration.Data.publishDBS.rstrip('/') not in dbs_urls): msg = "CRAB configuration problem: Parameter Data.publishDBS has an invalid value '%s'" % self.configuration.Data.publishDBS msg += "\nAllowed values are: " msg += "\n ".join([ "'%s' ('%s')" % (alias, url) for alias, url in DBSURLS['writer'].iteritems() ]) publishDBS_default = getParamDefaultValue( 'Data.publishDBS') if publishDBS_default: publishDBS_default, publishDBS_default_alias = self.getDBSURLAndAlias( publishDBS_default, 'writer') if publishDBS_default and publishDBS_default_alias: msg += "\nIf Data.publishDBS would not be specified, the default '%s' ('%s') would be used" \ % (publishDBS_default_alias, publishDBS_default) return False, msg if hasattr(self.configuration.JobType, 'scriptExe'): if not os.path.isfile(self.configuration.JobType.scriptExe): msg = "Cannot find the file %s specified in the scriptExe configuration parameter" % self.configuration.JobType.scriptExe return False, msg return True, "Valid configuration"
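## Illustrative sketch (not part of CRABClient): the publication/transfer consistency rules
## checked in the validation above, expressed as a standalone function over plain keyword
## arguments. The argument names mirror the CRAB parameters, but the flat interface and the
## example site name are hypothetical.
def check_transfer_settings(publication, transfer_outputs, transfer_logs, storage_site=None):
    """Return (valid, reason), mirroring the publication/transfer/storageSite checks above."""
    if publication and not transfer_outputs:
        return False, ("Data.publication is on, but General.transferOutputs is off;"
                       " publication requires the output files to be transferred to permanent storage.")
    if (transfer_outputs or transfer_logs) and not storage_site:
        return False, "Parameter Site.storageSite is missing."
    return True, "Valid configuration"

if __name__ == '__main__':
    print(check_transfer_settings(True, False, False, 'T2_CH_CERN'))  # invalid combination
    print(check_transfer_settings(True, True, True, None))            # missing storage site
    print(check_transfer_settings(True, True, True, 'T2_CH_CERN'))    # valid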
def run(self, filecacheurl = None): """ Override run() for JobType """ configArguments = {'addoutputfiles' : [], 'adduserfiles' : [], 'tfileoutfiles' : [], 'edmoutfiles' : [], } # Get SCRAM environment scram = ScramEnvironment(logger=self.logger) configArguments.update({'jobarch' : scram.scramArch, 'jobsw' : scram.cmsswVersion, }) # Build tarball if self.workdir: tarUUID = PandaInterface.wrappedUuidGen() self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID) if len(tarUUID): tarFilename = os.path.join(self.workdir, tarUUID +'default.tgz') cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py') else: raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.') else: _dummy, tarFilename = tempfile.mkstemp(suffix='.tgz') _dummy, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py') if getattr(self.config.Data, 'inputDataset', None): configArguments['inputdata'] = self.config.Data.inputDataset # configArguments['ProcessingVersion'] = getattr(self.config.Data, 'processingVersion', None) ## Create CMSSW config. self.logger.debug("self.config: %s" % (self.config)) self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName)) ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent ## in the sense that a second loading of the same pset may not produce the same ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW ## pset twice. However, some "complicated" psets seem to evade the caching. ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that ## it can be reused later if wanted (for example, in PrivateMC when checking if ## the pset has an LHE source) instead of having to load the pset again. ## As for what does "complicated" psets mean, Daniel Riley said that there are ## some psets where one module modifies the configuration from another module. self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger, userConfig=self.config.JobType.psetName) ## Interrogate the CMSSW pset for output files (only output files produced by ## PoolOutputModule or TFileService are identified automatically). Do this ## automatic detection even if JobType.disableAutomaticOutputCollection = True, ## so that we can still classify the output files in EDM, TFile and additional ## output files in the Task DB (and the job ad). ## TODO: Do we really need this classification at all? cmscp and PostJob read ## the FJR to know if an output file is EDM, TFile or other. edmfiles, tfiles = self.cmsswCfg.outputFiles() ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile ## output files that are not listed in JobType.outputFiles. if getattr(self.config.JobType, 'disableAutomaticOutputCollection', getParamDefaultValue('JobType.disableAutomaticOutputCollection')): outputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', [])] edmfiles = [file for file in edmfiles if file in outputFiles] tfiles = [file for file in tfiles if file in outputFiles] ## Get the list of additional output files that have to be collected as given ## in JobType.outputFiles, but remove duplicates listed already as EDM files or ## TFiles. 
addoutputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', []) if re.sub(r'^file:', '', file) not in edmfiles+tfiles] self.logger.debug("The following EDM output files will be collected: %s" % edmfiles) self.logger.debug("The following TFile output files will be collected: %s" % tfiles) self.logger.debug("The following user output files will be collected: %s" % addoutputFiles) configArguments['edmoutfiles'] = edmfiles configArguments['tfileoutfiles'] = tfiles configArguments['addoutputfiles'].extend(addoutputFiles) # Write out CMSSW config self.cmsswCfg.writeFile(cfgOutputName) ## UserTarball calls ScramEnvironment which can raise EnvironmentException. ## Since ScramEnvironment is already called above and the exception is not ## handled, we are sure that if we reached this point it will not raise EnvironmentException. ## But otherwise we should take this into account. with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb: inputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'inputFiles', [])] tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName) configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles] uploadResults = tb.upload(filecacheurl = filecacheurl) self.logger.debug("Result uploading input files: %s " % str(uploadResults)) configArguments['cacheurl'] = filecacheurl configArguments['cachefilename'] = uploadResults[0] isbchecksum = uploadResults[1] # Upload list of user-defined input files to process as the primary input userFilesList = getattr(self.config.Data, 'userInputFiles', None) if userFilesList: self.logger.debug("Attaching list of user-specified primary input files.") userFilesList = map(string.strip, userFilesList) userFilesList = [file for file in userFilesList if file] if len(userFilesList) != len(set(userFilesList)): msg = "%sWarning%s:" % (colors.RED, colors.NORMAL) msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries." msg += " Duplicated entries will be removed." self.logger.warning(msg) configArguments['userfiles'] = set(userFilesList) ## Get the user-specified primary dataset name. primaryDataset = getattr(self.config.Data, 'primaryDataset', 'CRAB_UserFiles') # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar" primaryDataset = "/" + os.path.join(*primaryDataset.split("/")) if not re.match("/%(primDS)s.*" % (lfnParts), primaryDataset): self.logger.warning("Invalid primary dataset name %s; publication may fail." % (primaryDataset)) configArguments['inputdata'] = primaryDataset lumi_mask_name = getattr(self.config.Data, 'lumiMask', None) lumi_list = None if lumi_mask_name: self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name)) lumi_list = getLumiList(lumi_mask_name, logger = self.logger) run_ranges = getattr(self.config.Data, 'runRange', None) run_ranges_is_valid = run_ranges is not None and isinstance(run_ranges, str) and re.match('^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges) if run_ranges_is_valid: run_list = getRunList(run_ranges) if lumi_list: lumi_list.selectRuns(run_list) else: if len(run_list) > 50000: msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list)) msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs." 
raise ConfigurationException(msg) lumi_list = LumiList(runs = run_list) if lumi_list: configArguments['runs'] = lumi_list.getRuns() ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5' lumi_mask = lumi_list.getCompactList() configArguments['lumis'] = [str(reduce(lambda x,y: x+y, lumi_mask[run]))[1:-1].replace(' ','') for run in configArguments['runs']] configArguments['jobtype'] = 'Analysis' return tarFilename, configArguments, isbchecksum
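## Illustrative sketch (not part of CRABClient): the lumi-range encoding used above, where each
## run's list of [first, last] lumi ranges is flattened into a comma separated string
## (e.g. [[1, 2], [5, 5]] becomes '1,2,5,5'). The sample lumi mask below is made-up data.
from functools import reduce  # 'reduce' is a builtin only in Python 2

def encode_lumi_ranges(ranges):
    """Flatten a list of [first, last] pairs into the '1,2,5,5' string format."""
    return str(reduce(lambda x, y: x + y, ranges))[1:-1].replace(' ', '')

if __name__ == '__main__':
    lumi_mask = {'1': [[1, 2], [5, 5]], '2': [[10, 20]]}
    print({run: encode_lumi_ranges(lumi_mask[run]) for run in sorted(lumi_mask)})
    # -> {'1': '1,2,5,5', '2': '10,20'}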