def upload(self):
    """
    Close the tarball and upload it to the Panda Cache.

    The reply of ``PandaInterface.putFile`` is a colon-separated string:
    a reply starting with ``NewFileName:`` means an identical input sandbox
    was found and is reused, a reply starting with ``True`` means the
    archive was stored.

    :return: tuple ``(serverUrl, archiveName, checksum)``
    :raises CachefileNotFoundException: for any other reply
    """
    self.close()
    archiveName = self.tarfile.name
    self.logger.debug(" uploading archive to cache %s " % archiveName)
    status, out = PandaInterface.putFile(archiveName, verbose=False,
                                         useCacheSrv=True, reuseSandbox=True)

    sandboxReused = out.startswith('NewFileName:')
    if not sandboxReused and not out.startswith('True'):
        # Neither of the two known success replies.
        self.logger.error( str(out) )
        self.logger.error("failed to upload source files with %s" % status)
        raise CachefileNotFoundException

    fields = out.split(':')
    if sandboxReused:
        # found the same input sandbox to reuse
        self.logger.debug("out: %s" % out)
        self.logger.debug("status: %s" % status)
        self.logger.debug("found the same input sandbox to reuse")
        archiveName = fields[-1]
        serverUrl = "https://%s:%s" % (fields[-2], '25443')
        self.logger.debug("archiveName: %s" %archiveName)
    else:
        archiveName = fields[-1]
        # The three fields preceding the file name are glued back together
        # with ':' to form the server URL.
        serverUrl = "%s:%s:%s" % (fields[-4], fields[-3], fields[-2])

    return serverUrl, archiveName, self.checksum
def upload(self):
    """
    Close the tarball and upload it to the Panda Cache.

    The base URL and file cache URL are read from ``self.config.JobType``
    and the archive checksum is sent along with the file.

    :return: tuple ``(serverUrl, archiveName, checksum)``
    :raises CachefileNotFoundException: when the reply starts with neither
        ``NewFileName:`` nor ``True``
    """
    self.close()
    archiveName = self.tarfile.name
    self.logger.debug(" uploading archive to cache %s " % archiveName)

    # disabling reuseSandbox as checksum is different every time (the pset is
    # written every time and the "last modified" date changes every time)
    jobTypeCfg = self.config.JobType
    status, out = PandaInterface.putFile(jobTypeCfg.baseurl, jobTypeCfg.filecacheurl,
                                         archiveName, self.checksum,
                                         verbose=False, reuseSandbox=False)

    sandboxReused = out.startswith('NewFileName:')
    if not sandboxReused and not out.startswith('True'):
        # Neither of the two known success replies.
        self.logger.error( str(out) )
        self.logger.error("failed to upload source files with %s" % status)
        raise CachefileNotFoundException

    fields = out.split(':')
    if sandboxReused:
        # found the same input sandbox to reuse
        self.logger.debug("out: %s" % out)
        self.logger.debug("status: %s" % status)
        self.logger.debug("found the same input sandbox to reuse")
        archiveName = fields[-1]
        serverUrl = "https://%s:%s" % (fields[-2], '25443')
        self.logger.debug("archiveName: %s" %archiveName)
    else:
        archiveName = fields[-1]
        # The three fields preceding the file name are glued back together
        # with ':' to form the server URL.
        serverUrl = "%s:%s:%s" % (fields[-4], fields[-3], fields[-2])

    return serverUrl, archiveName, self.checksum
def status(self, workflow, userdn, userproxy=None):
    """Retrieve the status of the workflow.

    The task row and its job definitions are read from the database; for
    every job definition that was not marked FAILED the Panda server is
    asked for the per-job statuses, which are then aggregated.

    :arg str workflow: a valid workflow name
    :arg userdn: forwarded to the Panda server query as ``dn``
    :arg userproxy: forwarded to the Panda server query as ``userproxy``
    :return: a one-element list holding the workflow status summary document
             (keys: status, taskFailureMsg, jobSetID, jobsPerStatus,
             failedJobdefs, totalJobdefs, jobdefErrors, jobList, saveLogs)"""
    self.logger.debug("Getting status for workflow %s" % workflow)
    row = self.api.query(None, None, ID.sql, taskname = workflow)
    # Just one row is picked up by the previous query. The next() builtin
    # works on both Python 2 and Python 3 iterators, unlike row.next().
    _, jobsetid, status, vogroup, vorole, taskFailure, splitArgs, resJobs, saveLogs = next(row)
    # resJobs is read from the row and parsed as a Python literal; job ids
    # found in it are left out of the summary below.
    resJobs = literal_eval(resJobs.read())
    self.logger.info("Status result for workflow %s: %s. JobsetID: %s" % (workflow, status, jobsetid))
    self.logger.debug("User vogroup=%s and user vorole=%s" % (vogroup, vorole))

    rows = self.api.query(None, None, GetJobGroupFromID.sql, taskname = workflow)
    jobsPerStatus = {}
    jobList = []
    totalJobdefs = 0
    failedJobdefs = 0
    jobDefErrs = []
    for jobdef in rows:
        jobdefid = jobdef[0]
        jobdefStatus = jobdef[1]
        jobdefError = jobdef[2].read() if jobdef[2] else ''
        totalJobdefs += 1
        self.logger.debug("DB Status for jobdefid %s is %s. %s" % (jobdefid, jobdefStatus, jobdefError))

        # check if the taskworker succeded in the submission
        if jobdefStatus == 'FAILED':
            jobDefErrs.append(jobdefError)
            failedJobdefs += 1
            continue

        # check the status of the jobdef in panda
        schedEC, res = pserver.getPandIDsWithJobID(self.centralcfg.centralconfig['backend-urls']['baseURLSSL'],
                                                   jobID=jobdefid, dn=userdn,
                                                   userproxy=userproxy, credpath=self.credpath)
        self.logger.debug("Status for jobdefid %s: %s" % (jobdefid, schedEC))
        if schedEC:
            # A truthy exit code is treated as a Panda server error.
            jobDefErrs.append("Cannot get information for jobdefid %s. Panda server error: %s" % (jobdefid, schedEC))
            self.logger.debug(jobDefErrs[-1])
            failedJobdefs += 1
            continue

        # prepare the result at the job granularity
        self.logger.debug("Iterating on: %s" % res)
        # items() iterates on both Python 2 and 3 (iteritems() is Python 2 only).
        for jobid, (jobstatus, _) in res.items():
            if jobid not in resJobs:
                jobsPerStatus[jobstatus] = jobsPerStatus.get(jobstatus, 0) + 1
                jobList.append((jobstatus, jobid))

    if jobsPerStatus:
        # if we actually submitted something
        status = self._updateTaskStatus(workflow, status, jobsPerStatus)

    return [{"status"         : status,
             "taskFailureMsg" : taskFailure.read() if taskFailure else '',
             "jobSetID"       : jobsetid if jobsetid else '',
             "jobsPerStatus"  : jobsPerStatus,
             "failedJobdefs"  : failedJobdefs,
             "totalJobdefs"   : totalJobdefs,
             "jobdefErrors"   : jobDefErrs,
             "jobList"        : jobList,
             "saveLogs"       : saveLogs}]
def run(self, filecacheurl = None):
    """
    Override run() for JobType.

    Checks the CRAB configuration, writes (or moves) the CMSSW configuration
    file, packs and uploads the input sandbox plus a tarball of debug files,
    and collects the arguments describing the task.

    :arg filecacheurl: forwarded to ``UserTarball.upload()`` and stored
        under ``configArguments['cacheurl']``.
    :return: tuple ``(tarFilename, configArguments)``.
    :raises ConfigurationException: for inconsistent Data parameters, an
        invalid pset, an unreadable lumi mask or an invalid run range.
    :raises EnvironmentException: when no UUID could be generated.
    :raises ClientException: when the sandbox upload fails; an HTTPException
        that does not carry the "file too big" error info is re-raised as is.
    """
    configArguments = {'addoutputfiles' : [],
                       'adduserfiles'   : [],
                       'tfileoutfiles'  : [],
                       'edmoutfiles'    : [],
                      }

    if getattr(self.config.Data, 'useParent', False) and getattr(self.config.Data, 'secondaryInputDataset', None):
        msg = "Invalid CRAB configuration: Parameters Data.useParent and Data.secondaryInputDataset cannot be used together."
        raise ConfigurationException(msg)

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({'jobarch': scram.getScramArch(),
                            'jobsw': scram.getCmsswVersion()})

    # Build tarball
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
        if not len(tarUUID):
            raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
        tarFilename = os.path.join(self.workdir, tarUUID + 'default.tgz')
        debugTarFilename = os.path.join(self.workdir, 'debugFiles.tgz')
        cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE)
    else:
        # mkstemp() returns an open descriptor together with the path; only
        # the path is used below, so close the descriptor instead of leaking it.
        tarFd, tarFilename = tempfile.mkstemp(suffix='.tgz')
        os.close(tarFd)
        # The debug tarball needs a path in this branch too, otherwise the
        # debug upload further down fails with a NameError.
        debugFd, debugTarFilename = tempfile.mkstemp(suffix='.tgz')
        os.close(debugFd)
        cfgFd, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')
        os.close(cfgFd)

    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset

    ## Create CMSSW config.
    self.logger.debug("self.config: %s" % (self.config))
    self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName))
    ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent
    ## in the sense that a second loading of the same pset may not produce the same
    ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW
    ## pset twice. However, some "complicated" psets seem to evade the caching.
    ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that
    ## it can be reused later if wanted (for example, in PrivateMC when checking if
    ## the pset has an LHE source) instead of having to load the pset again.
    ## As for what does "complicated" psets mean, Daniel Riley said that there are
    ## some psets where one module modifies the configuration from another module.
    self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                                userConfig=self.config.JobType.psetName)

    ## If there is a CMSSW pset, do a basic validation of it.
    if not bootstrapDone() and self.config.JobType.psetName:
        valid, msg = self.cmsswCfg.validateConfig()
        if not valid:
            raise ConfigurationException(msg)

    ## We need to put the pickled CMSSW configuration in the right place.
    ## Here, we determine if the bootstrap script already run and prepared everything
    ## for us. In such case we move the file, otherwise we pickle.dump the pset
    if not bootstrapDone():
        # Write out CMSSW config
        self.cmsswCfg.writeFile(cfgOutputName)
    else:
        # Move the pickled and the configuration files created by the bootstrap script
        self.moveCfgFile(cfgOutputName)

    ## Interrogate the CMSSW pset for output files (only output files produced by
    ## PoolOutputModule or TFileService are identified automatically). Do this
    ## automatic detection even if JobType.disableAutomaticOutputCollection = True,
    ## so that we can still classify the output files in EDM, TFile and additional
    ## output files in the Task DB (and the job ad).
    ## TODO: Do we really need this classification at all? cmscp and PostJob read
    ## the FJR to know if an output file is EDM, TFile or other.
    edmfiles, tfiles = self.cmsswCfg.outputFiles()

    autoCollectionDisabled = getattr(self.config.JobType, 'disableAutomaticOutputCollection',
                                     getParamDefaultValue('JobType.disableAutomaticOutputCollection'))
    # JobType.outputFiles with any leading 'file:' removed.
    userOutputFiles = [re.sub(r'^file:', '', fname)
                       for fname in getattr(self.config.JobType, 'outputFiles', [])]

    ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile
    ## output files that are not listed in JobType.outputFiles.
    if autoCollectionDisabled:
        edmfiles = [fname for fname in edmfiles if fname in userOutputFiles]
        tfiles = [fname for fname in tfiles if fname in userOutputFiles]

    ## Get the list of additional output files that have to be collected as given
    ## in JobType.outputFiles, but remove duplicates listed already as EDM files or
    ## TFiles.
    autoDetectedFiles = edmfiles + tfiles
    addoutputFiles = [fname for fname in userOutputFiles if fname not in autoDetectedFiles]
    self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
    configArguments['edmoutfiles'] = edmfiles
    configArguments['tfileoutfiles'] = tfiles
    configArguments['addoutputfiles'].extend(addoutputFiles)

    ## Give warning message in case no output file was detected in the CMSSW pset
    ## nor was any specified in the CRAB configuration.
    if not configArguments['edmoutfiles'] and not configArguments['tfileoutfiles'] and not configArguments['addoutputfiles']:
        msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
        if autoCollectionDisabled:
            msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration"
            msg += " and no output file was explicitly specified in the CRAB configuration."
        else:
            msg += " CRAB could not detect any output file in the CMSSW configuration"
            msg += " nor was any explicitly specified in the CRAB configuration."
        msg += " Hence CRAB will not collect any output file from this task."
        self.logger.warning(msg)

    ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
    ## Since ScramEnvironment is already called above and the exception is not
    ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
    ## But otherwise we should take this into account.
    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = [re.sub(r'^file:', '', fname)
                      for fname in getattr(self.config.JobType, 'inputFiles', [])]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        try:
            uploadResult = tb.upload(filecacheurl = filecacheurl)
        except HTTPException as hte:
            if 'X-Error-Info' in hte.headers:
                reason = hte.headers['X-Error-Info']
                reason_re = re.compile(r'\AFile size is ([0-9]*)B\. This is bigger than the maximum allowed size of ([0-9]*)B\.$')
                re_match = reason_re.match(reason)
                if re_match:
                    # The server refused the sandbox because of its size:
                    # report the sizes in MB and list the sandbox content.
                    ISBSize = int(re_match.group(1))
                    ISBSizeLimit = int(re_match.group(2))
                    reason = "%sError%s:" % (colors.RED, colors.NORMAL)
                    # Floor division keeps whole megabytes on Python 2 and 3 alike.
                    reason += " Input sanbox size is ~%sMB. This is bigger than the maximum allowed size of %sMB." % (ISBSize // 1024 // 1024, ISBSizeLimit // 1024 // 1024)
                    ISBContent = sorted(tb.content, reverse=True)
                    biggestFileSize = ISBContent[0][0]
                    # Width of the size column: number of digits of the biggest size.
                    ndigits = int(math.ceil(math.log(biggestFileSize+1, 10)))
                    reason += "\nInput sanbox content sorted by size[Bytes]:"
                    for (size, name) in ISBContent:
                        reason += ("\n%" + str(ndigits) + "s\t%s") % (size, name)
                    raise ClientException(reason)
            # Any other HTTP failure is propagated untouched (bare raise keeps
            # the original traceback).
            raise
        except Exception as e:
            msg = ("Impossible to calculate the checksum of the sandbox tarball.\nError message: %s.\n"
                   "More details can be found in %s" % (e, self.logger.logfile))
            LOGGERS['CRAB3'].exception(msg) #the traceback is only printed into the logfile
            raise ClientException(msg)

    # Uploading the debug files is best effort: a failure is logged only.
    debugFilesUploadResult = None
    with UserTarball(name=debugTarFilename, logger=self.logger, config=self.config) as dtb:
        dtb.addMonFiles()
        try:
            debugFilesUploadResult = dtb.upload(filecacheurl = filecacheurl)
        except Exception as e:
            msg = ("Problem uploading debug_files.tar.gz.\nError message: %s.\n"
                   "More details can be found in %s" % (e, self.logger.logfile))
            LOGGERS['CRAB3'].exception(msg) #the traceback is only printed into the logfile

    configArguments['cacheurl'] = filecacheurl
    configArguments['cachefilename'] = "%s.tar.gz" % uploadResult
    if debugFilesUploadResult is not None:
        configArguments['debugfilename'] = "%s.tar.gz" % debugFilesUploadResult
    self.logger.debug("Result uploading input files: %(cachefilename)s " % configArguments)

    # Upload list of user-defined input files to process as the primary input
    userFilesList = getattr(self.config.Data, 'userInputFiles', None)
    if userFilesList:
        self.logger.debug("Attaching list of user-specified primary input files.")
        # Strip every entry and drop the ones that end up empty
        # (str.strip replaces the Python-2-only string.strip function).
        userFilesList = [fname for fname in (entry.strip() for entry in userFilesList) if fname]
        if len(userFilesList) != len(set(userFilesList)):
            msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
            msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries."
            msg += " Duplicated entries will be removed."
            self.logger.warning(msg)
        configArguments['userfiles'] = set(userFilesList)
        configArguments['primarydataset'] = getattr(self.config.Data, 'outputPrimaryDataset', 'CRAB_UserFiles')

    lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name))
        try:
            lumi_list = getLumiList(lumi_mask_name, logger = self.logger)
        except ValueError as ex:
            msg = "%sError%s:" % (colors.RED, colors.NORMAL)
            msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex)
            raise ConfigurationException(msg)

    run_ranges = getattr(self.config.Data, 'runRange', None)
    if run_ranges:
        # Raw string: same pattern, without relying on unknown string escapes.
        run_ranges_is_valid = re.match(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges)
        if not run_ranges_is_valid:
            msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. Example: '12345,99900-99910'"
            raise ConfigurationException(msg)
        run_list = getRunList(run_ranges)
        if lumi_list:
            lumi_list.selectRuns(run_list)
            if not lumi_list:
                msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null."
                raise ConfigurationException(msg)
        else:
            if len(run_list) > 50000:
                msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list))
                msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                raise ConfigurationException(msg)
            lumi_list = LumiList(runs = run_list)

    if lumi_list:
        configArguments['runs'] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments['lumis'] = [','.join(str(lumi) for lumiRange in lumi_mask[run] for lumi in lumiRange)
                                    for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments
def run(self, requestConfig):
    """
    Override run() for JobType.

    Writes the CMSSW configuration, packs it with the user input files into
    the sandbox tarball, uploads the tarball and collects the arguments
    describing the task.

    :arg requestConfig: not used by this implementation.
    :return: tuple ``(tarFilename, configArguments, isbchecksum)`` where
        ``isbchecksum`` is the third element returned by the upload.
    :raises EnvironmentException: when no UUID could be generated.
    """
    configArguments = {'addoutputfiles' : [],
                       'adduserfiles'   : [],
                       'tfileoutfiles'  : [],
                       'edmoutfiles'    : [],
                      }

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({'jobarch' : scram.scramArch,
                            'jobsw'   : scram.cmsswVersion,
                           })

    # Build tarball
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
        if not len(tarUUID):
            raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
        tarFilename = os.path.join(self.workdir, tarUUID +'default.tgz')
        cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py')
    else:
        # mkstemp() returns an open descriptor together with the path; only
        # the path is used below, so close the descriptor instead of leaking it.
        tarFd, tarFilename = tempfile.mkstemp(suffix='.tgz')
        os.close(tarFd)
        cfgFd, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')
        os.close(cfgFd)

    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset

    # Create CMSSW config
    self.logger.debug("self.config: %s" % self.config)
    self.logger.debug("self.config.JobType.psetName: %s" % self.config.JobType.psetName)
    cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                           userConfig=self.config.JobType.psetName)

    # Interogate CMSSW config and user config for output file names, for now no use for edmFiles or TFiles here.
    analysisFiles, edmFiles = cmsswCfg.outputFiles()
    self.logger.debug("TFiles %s and EDM Files %s will be collected" % (analysisFiles, edmFiles))
    configArguments['tfileoutfiles'] = analysisFiles
    configArguments['edmoutfiles'] = edmFiles

    outputFiles = getattr(self.config.JobType, 'outputFiles', [])
    self.logger.debug("User files %s will be collected" % outputFiles)
    configArguments['addoutputfiles'].extend(outputFiles)

    # Write out CMSSW config
    cmsswCfg.writeFile(cfgOutputName)

    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = getattr(self.config.JobType, 'inputFiles', [])
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        uploadResults = tb.upload()

    # uploadResults is indexed as (cache url, cache file name, checksum).
    self.logger.debug("Result uploading input files: %s " % str(uploadResults))
    configArguments['cachefilename'] = uploadResults[1]
    configArguments['cacheurl'] = uploadResults[0]
    isbchecksum = uploadResults[2]

    # Upload lumi mask if it exists
    lumiMaskName = getattr(self.config.Data, 'lumiMask', None)
    if lumiMaskName:
        self.logger.debug("Attaching lumi mask %s to the request" % lumiMaskName)
        lumiDict = getLumiMask(config=self.config, logger=self.logger)
        # Materialise the keys so that a real list is stored on Python 3 too.
        configArguments['runs'] = list(lumiDict.keys())
        # for each run we'll encode the lumis as a string representing a list of integers
        # [[1,2],[5,5]] ==> '1,2,5,5'
        configArguments['lumis'] = [','.join(str(lumi) for lumiRange in lumiDict[run] for lumi in lumiRange)
                                    for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments, isbchecksum
def run(self, filecacheurl=None):
    """
    Override run() for JobType.

    Checks the CRAB configuration, writes (or moves) the CMSSW configuration
    file, packs and uploads the input sandbox, and collects the arguments
    describing the task.

    :arg filecacheurl: forwarded to ``UserTarball.upload()`` and stored
        under ``configArguments['cacheurl']``.
    :return: tuple ``(tarFilename, configArguments)``.
    :raises ConfigurationException: for inconsistent Data parameters, an
        invalid pset, an unreadable lumi mask or an invalid run range.
    :raises EnvironmentException: when no UUID could be generated.
    :raises ClientException: when the sandbox upload fails; an HTTPException
        that does not carry the "file too big" error info is re-raised as is.
    """
    configArguments = {
        'addoutputfiles': [],
        'adduserfiles': [],
        'tfileoutfiles': [],
        'edmoutfiles': [],
    }

    if getattr(self.config.Data, 'useParent', False) and getattr(
            self.config.Data, 'secondaryInputDataset', None):
        msg = "Invalid CRAB configuration: Parameters Data.useParent and Data.secondaryInputDataset cannot be used together."
        raise ConfigurationException(msg)

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({
        'jobarch': scram.getScramArch(),
        'jobsw': scram.getCmsswVersion()
    })

    # Build tarball
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
        if not len(tarUUID):
            raise EnvironmentException(
                'Problem with uuidgen while preparing for Sandbox upload.')
        tarFilename = os.path.join(self.workdir, tarUUID + 'default.tgz')
        cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE)
    else:
        # mkstemp() returns an open descriptor together with the path; only
        # the path is used below, so close the descriptor instead of leaking it.
        tarFd, tarFilename = tempfile.mkstemp(suffix='.tgz')
        os.close(tarFd)
        cfgFd, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')
        os.close(cfgFd)

    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset

    ## Create CMSSW config.
    self.logger.debug("self.config: %s" % (self.config))
    self.logger.debug("self.config.JobType.psetName: %s" %
                      (self.config.JobType.psetName))
    ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent
    ## in the sense that a second loading of the same pset may not produce the same
    ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW
    ## pset twice. However, some "complicated" psets seem to evade the caching.
    ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that
    ## it can be reused later if wanted (for example, in PrivateMC when checking if
    ## the pset has an LHE source) instead of having to load the pset again.
    ## As for what does "complicated" psets mean, Daniel Riley said that there are
    ## some psets where one module modifies the configuration from another module.
    self.cmsswCfg = CMSSWConfig(config=self.config,
                                logger=self.logger,
                                userConfig=self.config.JobType.psetName)

    ## If there is a CMSSW pset, do a basic validation of it.
    if not bootstrapDone() and self.config.JobType.psetName:
        valid, msg = self.cmsswCfg.validateConfig()
        if not valid:
            raise ConfigurationException(msg)

    ## We need to put the pickled CMSSW configuration in the right place.
    ## Here, we determine if the bootstrap script already run and prepared everything
    ## for us. In such case we move the file, otherwise we pickle.dump the pset
    if not bootstrapDone():
        # Write out CMSSW config
        self.cmsswCfg.writeFile(cfgOutputName)
    else:
        # Move the pickled and the configuration files created by the bootstrap script
        self.moveCfgFile(cfgOutputName)

    ## Interrogate the CMSSW pset for output files (only output files produced by
    ## PoolOutputModule or TFileService are identified automatically). Do this
    ## automatic detection even if JobType.disableAutomaticOutputCollection = True,
    ## so that we can still classify the output files in EDM, TFile and additional
    ## output files in the Task DB (and the job ad).
    ## TODO: Do we really need this classification at all? cmscp and PostJob read
    ## the FJR to know if an output file is EDM, TFile or other.
    edmfiles, tfiles = self.cmsswCfg.outputFiles()

    autoCollectionDisabled = getattr(
        self.config.JobType, 'disableAutomaticOutputCollection',
        getParamDefaultValue('JobType.disableAutomaticOutputCollection'))
    # JobType.outputFiles with any leading 'file:' removed.
    userOutputFiles = [
        re.sub(r'^file:', '', fname)
        for fname in getattr(self.config.JobType, 'outputFiles', [])
    ]

    ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile
    ## output files that are not listed in JobType.outputFiles.
    if autoCollectionDisabled:
        edmfiles = [fname for fname in edmfiles if fname in userOutputFiles]
        tfiles = [fname for fname in tfiles if fname in userOutputFiles]

    ## Get the list of additional output files that have to be collected as given
    ## in JobType.outputFiles, but remove duplicates listed already as EDM files or
    ## TFiles.
    autoDetectedFiles = edmfiles + tfiles
    addoutputFiles = [
        fname for fname in userOutputFiles if fname not in autoDetectedFiles
    ]
    self.logger.debug(
        "The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug(
        "The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug(
        "The following user output files will be collected: %s" %
        addoutputFiles)
    configArguments['edmoutfiles'] = edmfiles
    configArguments['tfileoutfiles'] = tfiles
    configArguments['addoutputfiles'].extend(addoutputFiles)

    ## Give warning message in case no output file was detected in the CMSSW pset
    ## nor was any specified in the CRAB configuration.
    if not configArguments['edmoutfiles'] and not configArguments[
            'tfileoutfiles'] and not configArguments['addoutputfiles']:
        msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
        if autoCollectionDisabled:
            msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration"
            msg += " and no output file was explicitly specified in the CRAB configuration."
        else:
            msg += " CRAB could not detect any output file in the CMSSW configuration"
            msg += " nor was any explicitly specified in the CRAB configuration."
        msg += " Hence CRAB will not collect any output file from this task."
        self.logger.warning(msg)

    ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
    ## Since ScramEnvironment is already called above and the exception is not
    ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
    ## But otherwise we should take this into account.
    with UserTarball(name=tarFilename, logger=self.logger,
                     config=self.config) as tb:
        inputFiles = [
            re.sub(r'^file:', '', fname)
            for fname in getattr(self.config.JobType, 'inputFiles', [])
        ]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [
            os.path.basename(f) for f in inputFiles
        ]
        try:
            uploadResult = tb.upload(filecacheurl=filecacheurl)
        except HTTPException as hte:
            if 'X-Error-Info' in hte.headers:
                reason = hte.headers['X-Error-Info']
                reason_re = re.compile(
                    r'\AFile size is ([0-9]*)B\. This is bigger than the maximum allowed size of ([0-9]*)B\.$'
                )
                re_match = reason_re.match(reason)
                if re_match:
                    # The server refused the sandbox because of its size:
                    # report the sizes in MB and list the sandbox content.
                    ISBSize = int(re_match.group(1))
                    ISBSizeLimit = int(re_match.group(2))
                    reason = "%sError%s:" % (colors.RED, colors.NORMAL)
                    # Floor division keeps whole megabytes on Python 2 and 3 alike.
                    reason += " Input sanbox size is ~%sMB. This is bigger than the maximum allowed size of %sMB." % (
                        ISBSize // 1024 // 1024, ISBSizeLimit // 1024 // 1024)
                    ISBContent = sorted(tb.content, reverse=True)
                    biggestFileSize = ISBContent[0][0]
                    # Width of the size column: number of digits of the biggest size.
                    ndigits = int(
                        math.ceil(math.log(biggestFileSize + 1, 10)))
                    reason += "\nInput sanbox content sorted by size[Bytes]:"
                    for (size, name) in ISBContent:
                        reason += ("\n%" + str(ndigits) + "s\t%s") % (size,
                                                                      name)
                    raise ClientException(reason)
            # Any other HTTP failure is propagated untouched (bare raise keeps
            # the original traceback).
            raise
        except Exception as e:
            msg = (
                "Impossible to calculate the checksum of the sandbox tarball.\nError message: %s.\n"
                "More details can be found in %s" % (e, self.logger.logfile))
            LOGGERS['CRAB3'].exception(
                msg)  #the traceback is only printed into the logfile
            raise ClientException(msg)

    configArguments['cacheurl'] = filecacheurl
    configArguments['cachefilename'] = "%s.tar.gz" % uploadResult
    self.logger.debug("Result uploading input files: %(cachefilename)s " %
                      configArguments)

    # Upload list of user-defined input files to process as the primary input
    userFilesList = getattr(self.config.Data, 'userInputFiles', None)
    if userFilesList:
        self.logger.debug(
            "Attaching list of user-specified primary input files.")
        # Strip every entry and drop the ones that end up empty
        # (str.strip replaces the Python-2-only string.strip function).
        userFilesList = [
            fname for fname in (entry.strip() for entry in userFilesList)
            if fname
        ]
        if len(userFilesList) != len(set(userFilesList)):
            msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
            msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries."
            msg += " Duplicated entries will be removed."
            self.logger.warning(msg)
        configArguments['userfiles'] = set(userFilesList)
        configArguments['primarydataset'] = getattr(
            self.config.Data, 'outputPrimaryDataset', 'CRAB_UserFiles')

    lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" %
                          (lumi_mask_name))
        try:
            lumi_list = getLumiList(lumi_mask_name, logger=self.logger)
        except ValueError as ex:
            msg = "%sError%s:" % (colors.RED, colors.NORMAL)
            msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex)
            raise ConfigurationException(msg)

    run_ranges = getattr(self.config.Data, 'runRange', None)
    if run_ranges:
        # Raw string: same pattern, without relying on unknown string escapes.
        run_ranges_is_valid = re.match(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$',
                                       run_ranges)
        if not run_ranges_is_valid:
            msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. Example: '12345,99900-99910'"
            raise ConfigurationException(msg)
        run_list = getRunList(run_ranges)
        if lumi_list:
            lumi_list.selectRuns(run_list)
            if not lumi_list:
                msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null."
                raise ConfigurationException(msg)
        else:
            if len(run_list) > 50000:
                msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(
                    len(run_list))
                msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                raise ConfigurationException(msg)
            lumi_list = LumiList(runs=run_list)

    if lumi_list:
        configArguments['runs'] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments['lumis'] = [
            ','.join(
                str(lumi) for lumiRange in lumi_mask[run]
                for lumi in lumiRange)
            for run in configArguments['runs']
        ]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments
def run(self, requestConfig):
    """
    Override run() for JobType.

    Writes the CMSSW configuration, packs it with the user input files into
    the sandbox tarball, uploads the tarball and collects the arguments
    describing the task, including an optional list of user input files,
    lumi mask and run range.

    :arg requestConfig: not used by this implementation.
    :return: tuple ``(tarFilename, configArguments, isbchecksum)`` where
        ``isbchecksum`` is the third element returned by the upload.
    :raises EnvironmentException: when no UUID could be generated.
    :raises ConfigurationException: when Data.runRange covers more than
        50000 runs and no lumi mask is given.
    """
    configArguments = {'addoutputfiles' : [],
                       'adduserfiles'   : [],
                       'tfileoutfiles'  : [],
                       'edmoutfiles'    : [],
                      }

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({'jobarch' : scram.scramArch,
                            'jobsw'   : scram.cmsswVersion,
                           })

    # Build tarball
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
        if not len(tarUUID):
            raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
        tarFilename = os.path.join(self.workdir, tarUUID +'default.tgz')
        cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py')
    else:
        # mkstemp() returns an open descriptor together with the path; only
        # the path is used below, so close the descriptor instead of leaking it.
        tarFd, tarFilename = tempfile.mkstemp(suffix='.tgz')
        os.close(tarFd)
        cfgFd, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')
        os.close(cfgFd)

    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset

    # Create CMSSW config
    self.logger.debug("self.config: %s" % self.config)
    self.logger.debug("self.config.JobType.psetName: %s" % self.config.JobType.psetName)
    cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                           userConfig=self.config.JobType.psetName)

    ## Interogate CMSSW config and user config for output file names. For now no use for EDM files or TFiles here.
    edmfiles, tfiles = cmsswCfg.outputFiles()
    # JobType.outputFiles with any leading 'file:' removed, minus the files
    # already reported by the CMSSW configuration.
    autoDetectedFiles = edmfiles + tfiles
    userOutputFiles = [re.sub(r'^file:', '', fname)
                       for fname in getattr(self.config.JobType, 'outputFiles', [])]
    addoutputFiles = [fname for fname in userOutputFiles if fname not in autoDetectedFiles]
    self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
    configArguments['edmoutfiles'] = edmfiles
    configArguments['tfileoutfiles'] = tfiles
    configArguments['addoutputfiles'].extend(addoutputFiles)

    # Write out CMSSW config
    cmsswCfg.writeFile(cfgOutputName)

    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = [re.sub(r'^file:', '', fname)
                      for fname in getattr(self.config.JobType, 'inputFiles', [])]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        uploadResults = tb.upload()

    # uploadResults is indexed as (cache url, cache file name, checksum).
    self.logger.debug("Result uploading input files: %s " % str(uploadResults))
    configArguments['cachefilename'] = uploadResults[1]
    configArguments['cacheurl'] = uploadResults[0]
    isbchecksum = uploadResults[2]

    # Upload list of user-defined input files to process as the primary input
    userFileName = getattr(self.config.Data, 'userInputFile', None)
    if userFileName:
        self.logger.debug("Attaching a list of user-specified primary input files from %s." % userFileName)
        # Read one file name per line; the with-block closes the file, and the
        # list comprehension removes white lines and empty entries.
        with open(userFileName) as userFile:
            fnames = [line.strip() for line in userFile]
        configArguments['userfiles'] = [fname for fname in fnames if fname]

        primDS = getattr(self.config.Data, 'primaryDataset', None)
        if primDS:
            # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar"
            primDS = "/" + os.path.join(*primDS.split("/"))
            if not re.match("/%(primDS)s.*" % lfnParts, primDS):
                self.logger.warning("Invalid primary dataset name %s for private MC; publishing may fail" % primDS)
            configArguments['inputdata'] = primDS
        else:
            configArguments['inputdata'] = getattr(self.config.Data, 'inputDataset', '/CRAB_UserFiles')

    lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % lumi_mask_name)
        lumi_list = getLumiList(lumi_mask_name, logger = self.logger)

    run_ranges = getattr(self.config.Data, 'runRange', None)
    # A run range that is missing, not a str, or malformed is silently ignored.
    # Raw string: same pattern, without relying on unknown string escapes.
    run_ranges_is_valid = (run_ranges is not None and isinstance(run_ranges, str)
                           and re.match(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges))
    if run_ranges_is_valid:
        run_list = getRunList(run_ranges)
        if lumi_list:
            lumi_list.selectRuns(run_list)
        else:
            if len(run_list) > 50000:
                msg = "Data.runRange includes %s runs." % str(len(run_list))
                msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                raise ConfigurationException(msg)
            lumi_list = LumiList(runs = run_list)

    if lumi_list:
        configArguments['runs'] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments['lumis'] = [','.join(str(lumi) for lumiRange in lumi_mask[run] for lumi in lumiRange)
                                    for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments, isbchecksum
def run(self, filecacheurl=None):
    """
    Override run() for JobType.

    Collect the SCRAM environment, write (or move) the CMSSW configuration,
    classify the output files, upload the user sandbox and translate the
    Data.* parameters (user input files, lumi mask, run range) into the
    arguments of the request.

    filecacheurl -- passed through to UserTarball.upload() and stored as
                    configArguments["cacheurl"].

    Returns the tuple (tarFilename, configArguments, isbchecksum), where
    isbchecksum is the second element of the upload result.

    Raises EnvironmentException if no UUID could be generated for the
    sandbox name, and ConfigurationException if the pset validation fails,
    the lumi mask can not be loaded, or Data.runRange is malformed, has a
    null intersection with the lumi mask, or includes more than 50000 runs
    when no lumi mask is given.
    """
    configArguments = {"addoutputfiles": [], "adduserfiles": [], "tfileoutfiles": [], "edmoutfiles": []}

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({"jobarch": scram.getScramArch(), "jobsw": scram.getCmsswVersion()})

    # Build tarball
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug("UNIQUE NAME: tarUUID %s " % tarUUID)
        if not tarUUID:
            raise EnvironmentException("Problem with uuidgen while preparing for Sandbox upload.")
        tarFilename = os.path.join(self.workdir, tarUUID + "default.tgz")
        cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE)
    else:
        ## mkstemp() returns an already open OS-level file descriptor together
        ## with the path. Only the path is used below, so close the descriptor
        ## right away instead of leaking it.
        tarFd, tarFilename = tempfile.mkstemp(suffix=".tgz")
        os.close(tarFd)
        cfgFd, cfgOutputName = tempfile.mkstemp(suffix="_cfg.py")
        os.close(cfgFd)

    if getattr(self.config.Data, "inputDataset", None):
        configArguments["inputdata"] = self.config.Data.inputDataset

    ## Create CMSSW config.
    self.logger.debug("self.config: %s" % (self.config))
    self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName))
    ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent
    ## in the sense that a second loading of the same pset may not produce the same
    ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW
    ## pset twice. However, some "complicated" psets seem to evade the caching.
    ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that
    ## it can be reused later if wanted (for example, in PrivateMC when checking if
    ## the pset has an LHE source) instead of having to load the pset again.
    ## As for what does "complicated" psets mean, Daniel Riley said that there are
    ## some psets where one module modifies the configuration from another module.
    self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger, userConfig=self.config.JobType.psetName)

    ## If there is a CMSSW pset, do a basic validation of it.
    if not bootstrapDone() and self.config.JobType.psetName:
        valid, msg = self.cmsswCfg.validateConfig()
        if not valid:
            raise ConfigurationException(msg)

    ## We need to put the pickled CMSSW configuration in the right place.
    ## Here, we determine if the bootstrap script already run and prepared everything
    ## for us. In such case we move the file, otherwise we pickle.dump the pset
    if not bootstrapDone():
        # Write out CMSSW config
        self.cmsswCfg.writeFile(cfgOutputName)
    else:
        # Move the pickled configuration file created by the bootstrap script
        self.moveCfgFile(cfgOutputName)

    ## Interrogate the CMSSW pset for output files (only output files produced by
    ## PoolOutputModule or TFileService are identified automatically). Do this
    ## automatic detection even if JobType.disableAutomaticOutputCollection = True,
    ## so that we can still classify the output files in EDM, TFile and additional
    ## output files in the Task DB (and the job ad).
    ## TODO: Do we really need this classification at all? cmscp and PostJob read
    ## the FJR to know if an output file is EDM, TFile or other.
    edmfiles, tfiles = self.cmsswCfg.outputFiles()

    ## The same flag drives both the filtering below and the wording of the
    ## "no output file" warning, so look it up only once.
    autoCollectionDisabled = getattr(
        self.config.JobType,
        "disableAutomaticOutputCollection",
        getParamDefaultValue("JobType.disableAutomaticOutputCollection"),
    )
    ## Output files requested by the user, with any leading "file:" removed.
    userOutputFiles = [re.sub(r"^file:", "", fname) for fname in getattr(self.config.JobType, "outputFiles", [])]

    ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile
    ## output files that are not listed in JobType.outputFiles.
    if autoCollectionDisabled:
        edmfiles = [fname for fname in edmfiles if fname in userOutputFiles]
        tfiles = [fname for fname in tfiles if fname in userOutputFiles]

    ## Get the list of additional output files that have to be collected as given
    ## in JobType.outputFiles, but remove duplicates listed already as EDM files or
    ## TFiles.
    detectedFiles = edmfiles + tfiles
    addoutputFiles = [fname for fname in userOutputFiles if fname not in detectedFiles]
    self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
    configArguments["edmoutfiles"] = edmfiles
    configArguments["tfileoutfiles"] = tfiles
    configArguments["addoutputfiles"].extend(addoutputFiles)

    ## Give warning message in case no output file was detected in the CMSSW pset
    ## nor was any specified in the CRAB configuration.
    if not (
        configArguments["edmoutfiles"]
        or configArguments["tfileoutfiles"]
        or configArguments["addoutputfiles"]
    ):
        msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
        if autoCollectionDisabled:
            msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration"
            msg += " and no output file was explicitly specified in the CRAB configuration."
        else:
            msg += " CRAB could not detect any output file in the CMSSW configuration"
            msg += " nor was any explicitly specified in the CRAB configuration."
        msg += " Hence CRAB will not collect any output file from this task."
        self.logger.warning(msg)

    ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
    ## Since ScramEnvironment is already called above and the exception is not
    ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
    ## But otherwise we should take this into account.
    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = [re.sub(r"^file:", "", fname) for fname in getattr(self.config.JobType, "inputFiles", [])]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments["adduserfiles"] = [os.path.basename(f) for f in inputFiles]
        uploadResults = tb.upload(filecacheurl=filecacheurl)

    self.logger.debug("Result uploading input files: %s " % str(uploadResults))
    configArguments["cacheurl"] = filecacheurl
    configArguments["cachefilename"] = uploadResults[0]
    isbchecksum = uploadResults[1]

    # Upload list of user-defined input files to process as the primary input
    userFilesList = getattr(self.config.Data, "userInputFiles", None)
    if userFilesList:
        self.logger.debug("Attaching list of user-specified primary input files.")
        ## Strip surrounding whitespace and drop the entries that end up empty.
        userFilesList = [fname for fname in (entry.strip() for entry in userFilesList) if fname]
        uniqueUserFiles = set(userFilesList)
        if len(userFilesList) != len(uniqueUserFiles):
            msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
            msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries."
            msg += " Duplicated entries will be removed."
            self.logger.warning(msg)
        configArguments["userfiles"] = uniqueUserFiles
        ## Get the user-specified primary dataset name.
        primaryDataset = getattr(self.config.Data, "primaryDataset", "CRAB_UserFiles")
        # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar"
        primaryDataset = "/" + os.path.join(*primaryDataset.split("/"))
        if not re.match("/%(primDS)s.*" % (lfnParts), primaryDataset):
            self.logger.warning("Invalid primary dataset name %s; publication may fail." % (primaryDataset))
        configArguments["inputdata"] = primaryDataset

    lumi_mask_name = getattr(self.config.Data, "lumiMask", None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name))
        try:
            lumi_list = getLumiList(lumi_mask_name, logger=self.logger)
        except ValueError as ex:
            msg = "%sError%s:" % (colors.RED, colors.NORMAL)
            msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex)
            raise ConfigurationException(msg)

    run_ranges = getattr(self.config.Data, "runRange", None)
    if run_ranges:
        if not re.match(r"^\d+((?!(-\d+-))(\,|\-)\d+)*$", run_ranges):
            msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. Example: '12345,99900-99910'"
            raise ConfigurationException(msg)
        run_list = getRunList(run_ranges)
        if lumi_list:
            ## Restrict the lumi mask to the requested runs; an empty result
            ## means the two selections do not overlap.
            lumi_list.selectRuns(run_list)
            if not lumi_list:
                msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null."
                raise ConfigurationException(msg)
        else:
            if len(run_list) > 50000:
                msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list))
                msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                raise ConfigurationException(msg)
            lumi_list = LumiList(runs=run_list)

    if lumi_list:
        configArguments["runs"] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments["lumis"] = [
            str(reduce(lambda x, y: x + y, lumi_mask[run]))[1:-1].replace(" ", "")
            for run in configArguments["runs"]
        ]

    configArguments["jobtype"] = "Analysis"

    return tarFilename, configArguments, isbchecksum
def run(self, filecacheurl = None):
    """
    Override run() for JobType.

    Collect the SCRAM environment, write the CMSSW configuration, classify
    the output files, upload the user sandbox and translate the Data.*
    parameters (user input files, lumi mask, run range) into the arguments
    of the request.

    filecacheurl -- passed through to UserTarball.upload() and stored as
                    configArguments['cacheurl'].

    Returns the tuple (tarFilename, configArguments, isbchecksum), where
    isbchecksum is the second element of the upload result.

    Raises EnvironmentException if no UUID could be generated for the
    sandbox name, and ConfigurationException if Data.runRange includes more
    than 50000 runs while no lumi mask is given.
    """
    configArguments = {'addoutputfiles' : [],
                       'adduserfiles'   : [],
                       'tfileoutfiles'  : [],
                       'edmoutfiles'    : [],
                      }

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({'jobarch' : scram.scramArch,
                            'jobsw'   : scram.cmsswVersion,
                           })

    # Build tarball
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
        if not tarUUID:
            raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
        tarFilename = os.path.join(self.workdir, tarUUID +'default.tgz')
        cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py')
    else:
        ## mkstemp() returns an already open OS-level file descriptor together
        ## with the path. Only the path is used below, so close the descriptor
        ## right away instead of leaking it.
        tarFd, tarFilename = tempfile.mkstemp(suffix='.tgz')
        os.close(tarFd)
        cfgFd, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')
        os.close(cfgFd)

    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset

    # Create CMSSW config
    self.logger.debug("self.config: %s" % self.config)
    self.logger.debug("self.config.JobType.psetName: %s" % self.config.JobType.psetName)
    cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                           userConfig=self.config.JobType.psetName)

    ## Interrogate CMSSW config and user config for output file names.
    ## User output files already detected as EDM files or TFiles are not
    ## listed a second time as additional output files.
    edmfiles, tfiles = cmsswCfg.outputFiles()
    detectedFiles = edmfiles + tfiles
    userOutputFiles = [re.sub(r'^file:', '', fname) for fname in getattr(self.config.JobType, 'outputFiles', [])]
    addoutputFiles = [fname for fname in userOutputFiles if fname not in detectedFiles]
    self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
    configArguments['edmoutfiles'] = edmfiles
    configArguments['tfileoutfiles'] = tfiles
    configArguments['addoutputfiles'].extend(addoutputFiles)

    # Write out CMSSW config
    cmsswCfg.writeFile(cfgOutputName)

    ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
    ## Since ScramEnvironment is already called above and the exception is not
    ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
    ## But otherwise we should take this into account.
    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = [re.sub(r'^file:', '', fname) for fname in getattr(self.config.JobType, 'inputFiles', [])]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        uploadResults = tb.upload(filecacheurl = filecacheurl)

    self.logger.debug("Result uploading input files: %s " % str(uploadResults))
    configArguments['cacheurl'] = filecacheurl
    configArguments['cachefilename'] = uploadResults[0]
    isbchecksum = uploadResults[1]

    # Upload list of user-defined input files to process as the primary input
    userFilesList = getattr(self.config.Data, 'userInputFiles', None)
    if userFilesList:
        self.logger.debug("Attaching list of user-specified primary input files.")
        ## Strip surrounding whitespace and drop the entries that end up empty.
        userFilesList = [fname for fname in (entry.strip() for entry in userFilesList) if fname]
        uniqueUserFiles = set(userFilesList)
        if len(userFilesList) != len(uniqueUserFiles):
            msg  = "%sWarning%s: CRAB configuration parameter Data.userInputFiles contains duplicated entries." % (colors.RED, colors.NORMAL)
            msg += " Duplicated entries will be removed."
            self.logger.warning(msg)
        configArguments['userfiles'] = uniqueUserFiles
        ## Get the user-specified primary dataset name.
        primaryDataset = getattr(self.config.Data, 'primaryDataset', 'CRAB_UserFiles')
        # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar"
        primaryDataset = "/" + os.path.join(*primaryDataset.split("/"))
        if not re.match("/%(primDS)s.*" % (lfnParts), primaryDataset):
            self.logger.warning("Invalid primary dataset name %s; publication may fail." % (primaryDataset))
        configArguments['inputdata'] = primaryDataset

    lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % lumi_mask_name)
        lumi_list = getLumiList(lumi_mask_name, logger = self.logger)

    ## A Data.runRange that is missing, not a str, or does not match the
    ## expected pattern is ignored here (no error is raised).
    run_ranges = getattr(self.config.Data, 'runRange', None)
    run_ranges_is_valid = (run_ranges is not None
                           and isinstance(run_ranges, str)
                           and re.match(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges))
    if run_ranges_is_valid:
        run_list = getRunList(run_ranges)
        if lumi_list:
            lumi_list.selectRuns(run_list)
        else:
            if len(run_list) > 50000:
                msg  = "Data.runRange includes %s runs." % str(len(run_list))
                msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                raise ConfigurationException(msg)
            lumi_list = LumiList(runs = run_list)

    if lumi_list:
        configArguments['runs'] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments['lumis'] = [str(reduce(lambda x,y: x+y, lumi_mask[run]))[1:-1].replace(' ','') for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments, isbchecksum
def run(self, requestConfig):
    """
    Override run() for JobType.

    Collect the SCRAM environment, write the CMSSW configuration, upload
    the user sandbox and, if a lumi mask is configured, encode its runs and
    lumis into the arguments of the request.

    requestConfig -- not used by this implementation.

    Returns the tuple (tarFilename, configArguments, isbchecksum), where
    isbchecksum is the third element of the upload result.
    """
    configArguments = {'addoutputfiles'            : [],
                       'adduserfiles'              : [],
                       'tfileoutfiles'             : [],
                       'edmoutfiles'               : [],
                      }

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({'jobarch'    : scram.scramArch,
                            'jobsw' : scram.cmsswVersion, })

    # Build tarball
    if self.workdir:
        tarFilename   = os.path.join(self.workdir, PandaInterface.wrappedUuidGen()+'default.tgz')
        cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py')
    else:
        # mkstemp() returns an already open OS-level file descriptor together
        # with the path. Only the path is used below, so close the descriptor
        # right away instead of leaking it.
        tarFd, tarFilename = tempfile.mkstemp(suffix='.tgz')
        os.close(tarFd)
        cfgFd, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')
        os.close(cfgFd)

    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset

    # Create CMSSW config
    self.logger.debug("self.config: %s" % self.config)
    self.logger.debug("self.config.JobType.psetName: %s" % self.config.JobType.psetName)
    cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                           userConfig=self.config.JobType.psetName)

    # Interrogate CMSSW config and user config for output file names.
    # NOTE(review): assumes outputFiles() returns (TFile outputs, EDM outputs)
    # in that order -- confirm against CMSSWConfig.outputFiles().
    analysisFiles, edmFiles = cmsswCfg.outputFiles()
    self.logger.debug("TFiles %s and EDM Files %s will be collected" % (analysisFiles, edmFiles))
    configArguments['tfileoutfiles'] = analysisFiles
    configArguments['edmoutfiles'] = edmFiles

    outputFiles = getattr(self.config.JobType, 'outputFiles', [])
    self.logger.debug("User files %s will be collected" % outputFiles)
    configArguments['addoutputfiles'].extend(outputFiles)

    # Write out CMSSW config
    cmsswCfg.writeFile(cfgOutputName)

    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = getattr(self.config.JobType, 'inputFiles', [])
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        uploadResults = tb.upload()

    # upload() results are read positionally: [0] cache URL, [1] cache file
    # name, [2] sandbox checksum.
    self.logger.debug("Result uploading input files: %s " % str(uploadResults))
    configArguments['cachefilename'] = uploadResults[1]
    configArguments['cacheurl'] = uploadResults[0]
    isbchecksum = uploadResults[2]

    # Upload lumi mask if it exists
    lumiMaskName = getattr(self.config.Data, 'lumiMask', None)
    if lumiMaskName:
        self.logger.debug("Attaching lumi mask %s to the request" % lumiMaskName)
        lumiDict = getLumiMask(config=self.config, logger=self.logger)
        configArguments['runs'] = lumiDict.keys()
        # for each run we'll encode the lumis as a string representing a list of integers
        # [[1,2],[5,5]] ==> '1,2,5,5'
        configArguments['lumis'] = [str(reduce(lambda x,y: x+y, lumiDict[run]))[1:-1].replace(' ','')
                                    for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments, isbchecksum
def run(self, requestConfig):
    """
    Override run() for JobType.

    Collect the SCRAM environment, write the CMSSW configuration, classify
    the output files, upload the user sandbox and translate the Data.*
    parameters (user input file list, lumi mask, run range) into the
    arguments of the request.

    requestConfig -- not used by this implementation.

    Returns the tuple (tarFilename, configArguments, isbchecksum), where
    isbchecksum is the third element of the upload result.

    Raises EnvironmentException if no UUID could be generated for the
    sandbox name, and ConfigurationException if Data.runRange includes more
    than 50000 runs while no lumi mask is given.
    """
    configArguments = {'addoutputfiles'            : [],
                       'adduserfiles'              : [],
                       'tfileoutfiles'             : [],
                       'edmoutfiles'               : [],
                      }

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({'jobarch'    : scram.scramArch,
                            'jobsw' : scram.cmsswVersion, })

    # Build tarball
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
        if not tarUUID:
            raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
        tarFilename = os.path.join(self.workdir, tarUUID +'default.tgz')
        cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py')
    else:
        ## mkstemp() returns an already open OS-level file descriptor together
        ## with the path. Only the path is used below, so close the descriptor
        ## right away instead of leaking it.
        tarFd, tarFilename = tempfile.mkstemp(suffix='.tgz')
        os.close(tarFd)
        cfgFd, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')
        os.close(cfgFd)

    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset

    # Create CMSSW config
    self.logger.debug("self.config: %s" % self.config)
    self.logger.debug("self.config.JobType.psetName: %s" % self.config.JobType.psetName)
    cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                           userConfig=self.config.JobType.psetName)

    ## Interrogate CMSSW config and user config for output file names.
    ## User output files already detected as EDM files or TFiles are not
    ## listed a second time as additional output files.
    edmfiles, tfiles = cmsswCfg.outputFiles()
    detectedFiles = edmfiles + tfiles
    userOutputFiles = [re.sub(r'^file:', '', fname) for fname in getattr(self.config.JobType, 'outputFiles', [])]
    addoutputFiles = [fname for fname in userOutputFiles if fname not in detectedFiles]
    self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
    configArguments['edmoutfiles'] = edmfiles
    configArguments['tfileoutfiles'] = tfiles
    configArguments['addoutputfiles'].extend(addoutputFiles)

    # Write out CMSSW config
    cmsswCfg.writeFile(cfgOutputName)

    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = [re.sub(r'^file:', '', fname) for fname in getattr(self.config.JobType, 'inputFiles', [])]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        uploadResults = tb.upload()

    ## upload() results are read positionally: [0] cache URL, [1] cache file
    ## name, [2] sandbox checksum.
    self.logger.debug("Result uploading input files: %s " % str(uploadResults))
    configArguments['cachefilename'] = uploadResults[1]
    configArguments['cacheurl'] = uploadResults[0]
    isbchecksum = uploadResults[2]

    # Upload list of user-defined input files to process as the primary input
    userFileName = getattr(self.config.Data, 'userInputFile', None)
    if userFileName:
        self.logger.debug("Attaching a list of user-specified primary input files from %s." % userFileName)
        ## Read the list through a context manager so that the file is closed
        ## even if reading fails.
        with open(userFileName) as userFile:
            fnames = [line.strip() for line in userFile]
        ## Removing white lines and empty objects.
        configArguments['userfiles'] = [fname for fname in fnames if fname]

        primDS = getattr(self.config.Data, 'primaryDataset', None)
        if primDS:
            # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar"
            primDS = "/" + os.path.join(*primDS.split("/"))
            if not re.match("/%(primDS)s.*" % lfnParts, primDS):
                self.logger.warning("Invalid primary dataset name %s for private MC; publishing may fail" % primDS)
            configArguments['inputdata'] = primDS
        else:
            configArguments['inputdata'] = getattr(self.config.Data, 'inputDataset', '/CRAB_UserFiles')

    lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % lumi_mask_name)
        lumi_list = getLumiList(lumi_mask_name, logger = self.logger)

    ## A Data.runRange that is missing, not a str, or does not match the
    ## expected pattern is ignored here (no error is raised).
    run_ranges = getattr(self.config.Data, 'runRange', None)
    run_ranges_is_valid = (run_ranges is not None
                           and isinstance(run_ranges, str)
                           and re.match(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges))
    if run_ranges_is_valid:
        run_list = getRunList(run_ranges)
        if lumi_list:
            lumi_list.selectRuns(run_list)
        else:
            if len(run_list) > 50000:
                msg  = "Data.runRange includes %s runs." % str(len(run_list))
                msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                raise ConfigurationException(msg)
            lumi_list = LumiList(runs = run_list)

    if lumi_list:
        configArguments['runs'] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments['lumis'] = [str(reduce(lambda x,y: x+y, lumi_mask[run]))[1:-1].replace(' ','') for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments, isbchecksum
def run(self, filecacheurl = None):
    """
    Override run() for JobType.

    Collect the SCRAM environment, write the CMSSW configuration, classify
    the output files, upload the user sandbox and translate the Data.*
    parameters (user input files, lumi mask, run range) into the arguments
    of the request.

    filecacheurl -- passed through to UserTarball.upload() and stored as
                    configArguments['cacheurl'].

    Returns the tuple (tarFilename, configArguments, isbchecksum), where
    isbchecksum is the second element of the upload result.

    Raises EnvironmentException if no UUID could be generated for the
    sandbox name, and ConfigurationException if Data.runRange includes more
    than 50000 runs while no lumi mask is given.
    """
    configArguments = {'addoutputfiles' : [],
                       'adduserfiles'   : [],
                       'tfileoutfiles'  : [],
                       'edmoutfiles'    : [],
                      }

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({'jobarch' : scram.scramArch,
                            'jobsw'   : scram.cmsswVersion,
                           })

    # Build tarball
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
        if not tarUUID:
            raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
        tarFilename = os.path.join(self.workdir, tarUUID +'default.tgz')
        cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py')
    else:
        ## mkstemp() returns an already open OS-level file descriptor together
        ## with the path. Only the path is used below, so close the descriptor
        ## right away instead of leaking it.
        tarFd, tarFilename = tempfile.mkstemp(suffix='.tgz')
        os.close(tarFd)
        cfgFd, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')
        os.close(cfgFd)

    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset

    ## Create CMSSW config.
    self.logger.debug("self.config: %s" % (self.config))
    self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName))
    ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent
    ## in the sense that a second loading of the same pset may not produce the same
    ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW
    ## pset twice. However, some "complicated" psets seem to evade the caching.
    ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that
    ## it can be reused later if wanted (for example, in PrivateMC when checking if
    ## the pset has an LHE source) instead of having to load the pset again.
    ## As for what does "complicated" psets mean, Daniel Riley said that there are
    ## some psets where one module modifies the configuration from another module.
    self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                                userConfig=self.config.JobType.psetName)

    ## Interrogate the CMSSW pset for output files (only output files produced by
    ## PoolOutputModule or TFileService are identified automatically). Do this
    ## automatic detection even if JobType.disableAutomaticOutputCollection = True,
    ## so that we can still classify the output files in EDM, TFile and additional
    ## output files in the Task DB (and the job ad).
    ## TODO: Do we really need this classification at all? cmscp and PostJob read
    ## the FJR to know if an output file is EDM, TFile or other.
    edmfiles, tfiles = self.cmsswCfg.outputFiles()

    ## Output files requested by the user, with any leading "file:" removed.
    userOutputFiles = [re.sub(r'^file:', '', fname) for fname in getattr(self.config.JobType, 'outputFiles', [])]

    ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile
    ## output files that are not listed in JobType.outputFiles.
    if getattr(self.config.JobType, 'disableAutomaticOutputCollection', getParamDefaultValue('JobType.disableAutomaticOutputCollection')):
        edmfiles = [fname for fname in edmfiles if fname in userOutputFiles]
        tfiles = [fname for fname in tfiles if fname in userOutputFiles]

    ## Get the list of additional output files that have to be collected as given
    ## in JobType.outputFiles, but remove duplicates listed already as EDM files or
    ## TFiles.
    detectedFiles = edmfiles + tfiles
    addoutputFiles = [fname for fname in userOutputFiles if fname not in detectedFiles]
    self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
    configArguments['edmoutfiles'] = edmfiles
    configArguments['tfileoutfiles'] = tfiles
    configArguments['addoutputfiles'].extend(addoutputFiles)

    # Write out CMSSW config
    self.cmsswCfg.writeFile(cfgOutputName)

    ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
    ## Since ScramEnvironment is already called above and the exception is not
    ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
    ## But otherwise we should take this into account.
    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = [re.sub(r'^file:', '', fname) for fname in getattr(self.config.JobType, 'inputFiles', [])]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        uploadResults = tb.upload(filecacheurl = filecacheurl)

    self.logger.debug("Result uploading input files: %s " % str(uploadResults))
    configArguments['cacheurl'] = filecacheurl
    configArguments['cachefilename'] = uploadResults[0]
    isbchecksum = uploadResults[1]

    # Upload list of user-defined input files to process as the primary input
    userFilesList = getattr(self.config.Data, 'userInputFiles', None)
    if userFilesList:
        self.logger.debug("Attaching list of user-specified primary input files.")
        ## Strip surrounding whitespace and drop the entries that end up empty.
        userFilesList = [fname for fname in (entry.strip() for entry in userFilesList) if fname]
        uniqueUserFiles = set(userFilesList)
        if len(userFilesList) != len(uniqueUserFiles):
            msg  = "%sWarning%s:" % (colors.RED, colors.NORMAL)
            msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries."
            msg += " Duplicated entries will be removed."
            self.logger.warning(msg)
        configArguments['userfiles'] = uniqueUserFiles
        ## Get the user-specified primary dataset name.
        primaryDataset = getattr(self.config.Data, 'primaryDataset', 'CRAB_UserFiles')
        # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar"
        primaryDataset = "/" + os.path.join(*primaryDataset.split("/"))
        if not re.match("/%(primDS)s.*" % (lfnParts), primaryDataset):
            self.logger.warning("Invalid primary dataset name %s; publication may fail." % (primaryDataset))
        configArguments['inputdata'] = primaryDataset

    lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name))
        lumi_list = getLumiList(lumi_mask_name, logger = self.logger)

    ## A Data.runRange that is missing, not a str, or does not match the
    ## expected pattern is ignored here (no error is raised).
    run_ranges = getattr(self.config.Data, 'runRange', None)
    run_ranges_is_valid = (run_ranges is not None
                           and isinstance(run_ranges, str)
                           and re.match(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges))
    if run_ranges_is_valid:
        run_list = getRunList(run_ranges)
        if lumi_list:
            lumi_list.selectRuns(run_list)
        else:
            if len(run_list) > 50000:
                msg  = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list))
                msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                raise ConfigurationException(msg)
            lumi_list = LumiList(runs = run_list)

    if lumi_list:
        configArguments['runs'] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments['lumis'] = [str(reduce(lambda x,y: x+y, lumi_mask[run]))[1:-1].replace(' ','') for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments, isbchecksum