def checkAutomaticAvail(self, allowedSplitAlgos):
    """Append 'Automatic' to allowedSplitAlgos when the CMSSW release is 7.2 or newer.

    The release string looks like 'CMSSW_X_Y_Z...'; only the major (X) and
    minor (Y) components are examined. The list is mutated in place.
    """
    scram = ScramEnvironment(logger=self.logger)
    # split('_', 3)[1:-1] keeps exactly the major and minor version fields.
    versionFields = scram.getCmsswVersion().split('_', 3)[1:-1]
    major, minor = (int(field) for field in versionFields)
    # Lexicographic tuple comparison == (major > 7) or (major == 7 and minor >= 2).
    if (major, minor) >= (7, 2):
        allowedSplitAlgos.append('Automatic')
def testInit(self):
    """
    Test constructor: building a ScramEnvironment and querying the
    CMSSW version must not raise.
    """
    ScramEnvironment(logger=self.testLogger).getCmsswVersion()
def __init__(self, name=None, mode='w:gz', config=None, logger=None):
    """Open a gzipped tarball at *name* for the user sandbox.

    Symbolic links are dereferenced on add so the archive contains the
    pointed-to files rather than the links.
    """
    self.config = config
    self.logger = logger
    self.checksum = None
    self.scram = ScramEnvironment(logger=self.logger)
    self.logger.debug("Making tarball in %s" % name)
    self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
def testAccessors(self):
    """
    Test various accessors: each SCRAM getter must report the value
    captured from the test environment.
    """
    scram = ScramEnvironment(logger=self.testLogger)
    checks = (
        (scram.getCmsswVersion, self.version),
        (scram.getScramArch, self.arch),
        (scram.getCmsswBase, self.base),
    )
    for accessor, expected in checks:
        self.assertEqual(accessor(), expected)
def testScram(self):
    """
    Test Scram environment: every accessor must return a non-None value
    once a CMSSW environment has been set up.
    """
    msg = "You must set up a CMSSW environment first"
    scram = ScramEnvironment(logger=self.logger)
    for value in (scram.getCmsswVersion(),
                  scram.getScramArch(),
                  scram.getCmsswBase()):
        self.assertNotEqual(value, None, msg)
def __init__(self, name=None, mode='w:gz', config=None, logger=None):
    """Create the sandbox tarball writer.

    Opens *name* as a gzip-compressed tar archive; dereference=True makes
    tarfile store the targets of symlinks instead of the links themselves.
    """
    self.config, self.logger = config, logger
    self.scram = ScramEnvironment(logger=self.logger)
    self.logger.debug("Making tarball in %s" % name)
    self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
    self.checksum = None
def __init__(self, name=None, mode='w:gz', config=None, logger=None):
    """Prepare a gzip-compressed tar archive for the user sandbox."""
    self.config = config
    self.logger = logger
    # ScramEnvironment captures the CMSSW environment (version/arch/base)
    # so later add* calls can locate the release area.
    self.scram = ScramEnvironment(logger=self.logger)
    self.logger.debug("Making tarball in %s" % name)
    # dereference=True archives the files symlinks point at, not the links.
    self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
    self.checksum = None

# NOTE(review): this assignment followed the constructor in the mangled
# source; it appears to be module-level, routing PanDA-interface logging to
# the CRAB3 traceback logger — confirm placement against upstream file.
PandaInterface.LOGGER = logging.getLogger('CRAB3:traceback')
class UserTarball(object):
    """
    _UserTarball_

    A subclass of TarFile for the user code tarballs. By default
    creates a new tarball with the user libraries from lib, module,
    and the data/ and interface/ sections of the src/ area.

    Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        """Open the sandbox tarball *name* for writing (gzip by default)."""
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        # dereference=True stores the files symlinks point at, not the links.
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball.

        :param userFiles: glob patterns from config.JobType.inputFiles;
            a pattern matching nothing raises InputFileNotFoundException.
        :param cfgOutputName: path of the generated CMSSW config; its .pkl
            and .py dump siblings are expected in the same directory.
        """
        directories = ['lib', 'biglib', 'module']
        if getattr(self.config.JobType, 'sendPythonFolder',
                   configParametersInfo['JobType.sendPythonFolder']['default']):
            directories.append('python')
        # /data/ subdirs contain data files needed by the code
        # /interface/ subdirs contain C++ header files needed e.g. by ROOT6
        dataDirs = ['data', 'interface']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug("Checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug("Adding directory %s to tarball" % fullPath)
                # Guard against symlink loops before dereferencing on add.
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _dummy, _dummy in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath, 'src')
                self.logger.debug("Adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException(
                    "The input file '%s' taken from parameter config.JobType.inputFiles cannot be found." % globName)
            for filename in fileNames:
                self.logger.debug("Adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                # Files are flattened to their basename inside the archive.
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)

        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        # Adding the pset files to the tarfile
        if cfgOutputName:
            basedir = os.path.dirname(cfgOutputName)
            self.tarfile.add(cfgOutputName, arcname=BOOTSTRAP_CFGFILE)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_PKL),
                             arcname=BOOTSTRAP_CFGFILE_PKL)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_DUMP),
                             arcname=BOOTSTRAP_CFGFILE_DUMP)

        # debug directory: ship the CRAB config and the original pset so the
        # task can be inspected server-side.
        configtmp = tempfile.NamedTemporaryFile(delete=True)
        configtmp.write(str(self.config))
        configtmp.flush()
        psetfilename = getattr(self.config.JobType, 'psetName', None)
        if not psetfilename == None:
            self.tarfile.add(psetfilename, '/debug/originalPSet.py')
        else:
            self.logger.debug('Failed to add pset to tarball')
        self.tarfile.add(configtmp.name, '/debug/crabConfig.py')
        configtmp.close()

    def writeContent(self):
        """Save the (size, name) list of the tarball members on self.content."""
        self.content = [(int(x.size), x.name) for x in self.tarfile.getmembers()]

    def close(self):
        """
        Record the tarball content and close the archive.
        """
        self.writeContent()
        return self.tarfile.close()

    def upload(self, filecacheurl=None):
        """
        Upload the tarball to the File Cache.

        Returns the cache hashkey as a string; raises
        CachefileNotFoundException if the cache response has no hashkey.
        """
        self.close()
        archiveName = self.tarfile.name
        self.logger.debug("Uploading archive %s to the CRAB cache. Using URI %s" % (archiveName, filecacheurl))
        ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint': filecacheurl, "pycurl": True})
        result = ufc.upload(archiveName, excludeList=USER_SANDBOX_EXCLUSIONS)
        if 'hashkey' not in result:
            self.logger.error("Failed to upload source files: %s" % str(result))
            raise CachefileNotFoundException
        return str(result['hashkey'])

    def checkdirectory(self, dir_):
        # checking for infinite symbolic link loop: stat every file while
        # following links; a loop surfaces as OSError (ELOOP).
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        except OSError as msg:
            err = '%sError%s: Infinite directory loop found in: %s \nStderr: %s' % \
                    (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)

    def __getattr__(self, *args):
        """
        Pass any unknown functions or attribute requests on to the TarFile object
        """
        self.logger.debug("Passing getattr %s on to TarFile" % args)
        return self.tarfile.__getattribute__(*args)

    def __enter__(self):
        """
        Allow use as context manager
        """
        return self

    def __exit__(self, excType, excValue, excTrace):
        """
        Allow use as context manager
        """
        self.tarfile.close()
        if excType:
            return False
class UserTarball(object):
    """
    _UserTarball_

    A subclass of TarFile for the user code tarballs. By default
    creates a new tarball with the user libraries from lib, module,
    and the data/ and interface/ sections of the src/ area.

    Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        """Open the sandbox tarball *name* for writing (gzip by default)."""
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        # dereference=True stores the files symlinks point at, not the links.
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball.

        :param userFiles: glob patterns from config.JobType.inputFiles;
            a pattern matching nothing raises InputFileNotFoundException.
        :param cfgOutputName: path of the generated CMSSW config; its .pkl
            sibling is added as PSet.pkl.
        """
        directories = ['lib', 'biglib', 'module']
        if getattr(self.config.JobType, 'sendPythonFolder',
                   configParametersInfo['JobType.sendPythonFolder']['default']):
            directories.append('python')
        # /data/ subdirs contain data files needed by the code
        # /interface/ subdirs contain C++ header files needed e.g. by ROOT6
        dataDirs = ['data', 'interface']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug(" checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug(" adding directory %s to tarball" % fullPath)
                # Guard against symlink loops before dereferencing on add.
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _dummy, _dummy in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath, 'src')
                self.logger.debug(" adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException(
                    'The input file "%s" taken from parameter config.JobType.inputFiles cannot be found' % globName)
            for filename in fileNames:
                self.logger.debug(" adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                # Files are flattened to their basename inside the archive.
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)

        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        # Adding the pset and crabconfig file to the tarfile
        if cfgOutputName:
            self.tarfile.add(cfgOutputName, arcname='PSet.py')
            self.tarfile.add(os.path.splitext(cfgOutputName)[0] + '.pkl', arcname='PSet.pkl')
        # Ship the CRAB config and original pset in /debug/ for inspection.
        configtmp = tempfile.NamedTemporaryFile(delete=True)
        configtmp.write(str(self.config))
        configtmp.flush()
        psetfilename = getattr(self.config.JobType, 'psetName', None)
        if not psetfilename == None:
            self.tarfile.add(psetfilename, '/debug/originalPSet.py')
        else:
            self.logger.debug('Failed to add pset to tarball')
        self.tarfile.add(configtmp.name, '/debug/crabConfig.py')
        configtmp.close()

    def close(self):
        """
        Calculate the checksum and close the archive.
        """
        self.calculateChecksum()
        return self.tarfile.close()

    def upload(self, filecacheurl=None):
        """
        Upload the tarball to the File Cache.

        Returns a ('<hashkey>.tar.gz', checksum) pair; raises
        CachefileNotFoundException if the cache response has no hashkey.
        """
        self.close()
        archiveName = self.tarfile.name
        self.logger.debug(" uploading archive to cache %s " % archiveName)
        ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint': filecacheurl})
        result = ufc.upload(archiveName)
        if 'hashkey' not in result:
            self.logger.error("Failed to upload source files: %s" % str(result))
            raise CachefileNotFoundException
        return str(result['hashkey']) + '.tar.gz', self.checksum

    def calculateChecksum(self):
        """
        Calculate a checksum that doesn't depend on the tgz creation data.

        Hashes the member metadata (name, size, mtime, uname) rather than the
        compressed bytes, so re-creating an identical sandbox yields the same
        checksum.
        """
        lsl = [(x.name, int(x.size), int(x.mtime), x.uname) for x in self.tarfile.getmembers()]
        # NOTE(review): md5(str(...)) only accepts a str on Python 2; on
        # Python 3 this would need an encode() — presumably Py2-era code.
        hasher = hashlib.md5(str(lsl))
        self.logger.debug('tgz contents: %s' % lsl)
        self.checksum = hasher.hexdigest()
        self.logger.debug('MD5 checksum: %s' % self.checksum)

        #Old way reads in the file again. May use for for non-tar files if needed.
        #sha256sum = hashlib.sha256()
        #with open(self.tarfile.name, 'rb') as f:
        #    while True:
        #        chunkdata = f.read(8192)
        #        if not chunkdata:
        #            break
        #        sha256sum.update(chunkdata)
        #sha256sum.hexdigest()

    def checkdirectory(self, dir_):
        # checking for infinite symbolic link loop: stat every file while
        # following links; a loop surfaces as OSError (ELOOP).
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        except OSError as msg:
            err = '%sError%s: Infinite directory loop found in: %s \nStderr: %s' % \
                    (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)

    def __getattr__(self, *args):
        """
        Pass any unknown functions or attribute requests on to the TarFile object
        """
        self.logger.debug("Passing getattr %s on to TarFile" % args)
        return self.tarfile.__getattribute__(*args)

    def __enter__(self):
        """
        Allow use as context manager
        """
        return self

    def __exit__(self, excType, excValue, excTrace):
        """
        Allow use as context manager
        """
        self.tarfile.close()
        if excType:
            return False
def run(self, filecacheurl=None):
    """
    Override run() for JobType.

    Builds the CMSSW config and the user sandbox tarball, uploads the
    sandbox to the CRAB cache, and assembles the task configuration
    arguments (output files, runs/lumis, cache info).

    :param filecacheurl: endpoint of the CRAB cache to upload to.
    :returns: (tarFilename, configArguments) tuple.
    """
    configArguments = {'addoutputfiles': [],
                       'adduserfiles': [],
                       'tfileoutfiles': [],
                       'edmoutfiles': [],
                      }

    if getattr(self.config.Data, 'useParent', False) and getattr(self.config.Data, 'secondaryInputDataset', None):
        msg = "Invalid CRAB configuration: Parameters Data.useParent and Data.secondaryInputDataset cannot be used together."
        raise ConfigurationException(msg)

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({'jobarch': scram.getScramArch(),
                            'jobsw': scram.getCmsswVersion()})

    # Build tarball
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
        if len(tarUUID):
            tarFilename = os.path.join(self.workdir, tarUUID + 'default.tgz')
            cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE)
        else:
            raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
    else:
        # No working directory: fall back to temporary files.
        _dummy, tarFilename = tempfile.mkstemp(suffix='.tgz')
        _dummy, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')

    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset

    ## Create CMSSW config.
    self.logger.debug("self.config: %s" % (self.config))
    self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName))
    ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent
    ## in the sense that a second loading of the same pset may not produce the same
    ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW
    ## pset twice. However, some "complicated" psets seem to evade the caching.
    ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that
    ## it can be reused later if wanted (for example, in PrivateMC when checking if
    ## the pset has an LHE source) instead of having to load the pset again.
    ## As for what does "complicated" psets mean, Daniel Riley said that there are
    ## some psets where one module modifies the configuration from another module.
    self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                                userConfig=self.config.JobType.psetName)

    ## If there is a CMSSW pset, do a basic validation of it.
    if not bootstrapDone() and self.config.JobType.psetName:
        valid, msg = self.cmsswCfg.validateConfig()
        if not valid:
            raise ConfigurationException(msg)

    ## We need to put the pickled CMSSW configuration in the right place.
    ## Here, we determine if the bootstrap script already run and prepared everything
    ## for us. In such case we move the file, otherwise we pickle.dump the pset
    if not bootstrapDone():
        # Write out CMSSW config
        self.cmsswCfg.writeFile(cfgOutputName)
    else:
        # Move the pickled and the configuration files created by the bootstrap script
        self.moveCfgFile(cfgOutputName)

    ## Interrogate the CMSSW pset for output files (only output files produced by
    ## PoolOutputModule or TFileService are identified automatically). Do this
    ## automatic detection even if JobType.disableAutomaticOutputCollection = True,
    ## so that we can still classify the output files in EDM, TFile and additional
    ## output files in the Task DB (and the job ad).
    ## TODO: Do we really need this classification at all? cmscp and PostJob read
    ## the FJR to know if an output file is EDM, TFile or other.
    edmfiles, tfiles = self.cmsswCfg.outputFiles()
    ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile
    ## output files that are not listed in JobType.outputFiles.
    if getattr(self.config.JobType, 'disableAutomaticOutputCollection',
               getParamDefaultValue('JobType.disableAutomaticOutputCollection')):
        outputFiles = [re.sub(r'^file:', '', file)
                       for file in getattr(self.config.JobType, 'outputFiles', [])]
        edmfiles = [file for file in edmfiles if file in outputFiles]
        tfiles = [file for file in tfiles if file in outputFiles]
    ## Get the list of additional output files that have to be collected as given
    ## in JobType.outputFiles, but remove duplicates listed already as EDM files or
    ## TFiles.
    addoutputFiles = [re.sub(r'^file:', '', file)
                      for file in getattr(self.config.JobType, 'outputFiles', [])
                      if re.sub(r'^file:', '', file) not in edmfiles + tfiles]
    self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
    configArguments['edmoutfiles'] = edmfiles
    configArguments['tfileoutfiles'] = tfiles
    configArguments['addoutputfiles'].extend(addoutputFiles)
    ## Give warning message in case no output file was detected in the CMSSW pset
    ## nor was any specified in the CRAB configuration.
    if not configArguments['edmoutfiles'] and not configArguments['tfileoutfiles'] and not configArguments['addoutputfiles']:
        msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
        if getattr(self.config.JobType, 'disableAutomaticOutputCollection',
                   getParamDefaultValue('JobType.disableAutomaticOutputCollection')):
            msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration"
            msg += " and no output file was explicitly specified in the CRAB configuration."
        else:
            msg += " CRAB could not detect any output file in the CMSSW configuration"
            msg += " nor was any explicitly specified in the CRAB configuration."
        msg += " Hence CRAB will not collect any output file from this task."
        self.logger.warning(msg)

    ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
    ## Since ScramEnvironment is already called above and the exception is not
    ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
    ## But otherwise we should take this into account.
    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = [re.sub(r'^file:', '', file)
                      for file in getattr(self.config.JobType, 'inputFiles', [])]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        try:
            uploadResult = tb.upload(filecacheurl=filecacheurl)
        except HTTPException as hte:
            # The cache reports an oversize sandbox via the X-Error-Info header;
            # turn that into a readable ClientException with a content listing.
            if 'X-Error-Info' in hte.headers:
                reason = hte.headers['X-Error-Info']
                reason_re = re.compile(r'\AFile size is ([0-9]*)B\. This is bigger than the maximum allowed size of ([0-9]*)B\.$')
                re_match = reason_re.match(reason)
                if re_match:
                    ISBSize = int(re_match.group(1))
                    ISBSizeLimit = int(re_match.group(2))
                    reason = "%sError%s:" % (colors.RED, colors.NORMAL)
                    reason += " Input sanbox size is ~%sMB. This is bigger than the maximum allowed size of %sMB." % (ISBSize / 1024 / 1024, ISBSizeLimit / 1024 / 1024)
                    ISBContent = sorted(tb.content, reverse=True)
                    biggestFileSize = ISBContent[0][0]
                    ndigits = int(math.ceil(math.log(biggestFileSize + 1, 10)))
                    reason += "\nInput sanbox content sorted by size[Bytes]:"
                    for (size, name) in ISBContent:
                        reason += ("\n%" + str(ndigits) + "s\t%s") % (size, name)
                    raise ClientException(reason)
            raise hte
        except Exception as e:
            msg = ("Impossible to calculate the checksum of the sandbox tarball.\nError message: %s.\n"
                   "More details can be found in %s" % (e, self.logger.logfile))
            LOGGERS['CRAB3'].exception(msg)  #the traceback is only printed into the logfile
            raise ClientException(msg)

    configArguments['cacheurl'] = filecacheurl
    configArguments['cachefilename'] = "%s.tar.gz" % uploadResult
    self.logger.debug("Result uploading input files: %(cachefilename)s " % configArguments)

    # Upload list of user-defined input files to process as the primary input
    userFilesList = getattr(self.config.Data, 'userInputFiles', None)
    if userFilesList:
        self.logger.debug("Attaching list of user-specified primary input files.")
        userFilesList = map(string.strip, userFilesList)
        userFilesList = [file for file in userFilesList if file]
        if len(userFilesList) != len(set(userFilesList)):
            msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
            msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries."
            msg += " Duplicated entries will be removed."
            self.logger.warning(msg)
        configArguments['userfiles'] = set(userFilesList)
        configArguments['primarydataset'] = getattr(self.config.Data, 'outputPrimaryDataset', 'CRAB_UserFiles')

    lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name))
        try:
            lumi_list = getLumiList(lumi_mask_name, logger=self.logger)
        except ValueError as ex:
            msg = "%sError%s:" % (colors.RED, colors.NORMAL)
            msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex)
            raise ConfigurationException(msg)
    run_ranges = getattr(self.config.Data, 'runRange', None)
    if run_ranges:
        # Accept comma-separated integers and inclusive ranges, e.g. '1,5-10'.
        run_ranges_is_valid = re.match('^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges)
        if run_ranges_is_valid:
            run_list = getRunList(run_ranges)
            if lumi_list:
                lumi_list.selectRuns(run_list)
                if not lumi_list:
                    msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null."
                    raise ConfigurationException(msg)
            else:
                if len(run_list) > 50000:
                    msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list))
                    msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                    raise ConfigurationException(msg)
                lumi_list = LumiList(runs=run_list)
        else:
            msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. Example: '12345,99900-99910'"
            raise ConfigurationException(msg)
    if lumi_list:
        configArguments['runs'] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments['lumis'] = [str(reduce(lambda x, y: x + y, lumi_mask[run]))[1:-1].replace(' ', '')
                                    for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments
def run(self, requestConfig):
    """
    Override run() for JobType.

    Builds the CMSSW config and sandbox tarball, uploads the sandbox and
    assembles the task configuration arguments.

    :returns: (tarFilename, configArguments, isbchecksum) tuple.
    """
    configArguments = {'addoutputfiles': [],
                       'adduserfiles': [],
                       'tfileoutfiles': [],
                       'edmoutfiles': [],
                      }

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({'jobarch': scram.scramArch,
                            'jobsw': scram.cmsswVersion,
                           })

    # Build tarball
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
        if len(tarUUID):
            tarFilename = os.path.join(self.workdir, tarUUID + 'default.tgz')
            cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py')
        else:
            raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
    else:
        # No working directory: fall back to temporary files.
        _dummy, tarFilename = tempfile.mkstemp(suffix='.tgz')
        _dummy, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')

    #configArguments['userisburl'] = 'https://'+ self.config.General.ufccacheUrl + '/crabcache/file?hashkey=' + uploadResults['hashkey']#XXX hardcoded
    #configArguments['userisburl'] = 'INSERTuserisburl'#XXX hardcoded
    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset
    # configArguments['ProcessingVersion'] = getattr(self.config.Data, 'processingVersion', None)

    # Create CMSSW config
    self.logger.debug("self.config: %s" % self.config)
    self.logger.debug("self.config.JobType.psetName: %s" % self.config.JobType.psetName)
    cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                           userConfig=self.config.JobType.psetName)

    ## Interogate CMSSW config and user config for output file names. For now no use for EDM files or TFiles here.
    edmfiles, tfiles = cmsswCfg.outputFiles()
    # User-listed output files, stripped of the 'file:' prefix, minus any
    # already detected as EDM or TFile outputs.
    addoutputFiles = [re.sub(r'^file:', '', file)
                      for file in getattr(self.config.JobType, 'outputFiles', [])
                      if re.sub(r'^file:', '', file) not in edmfiles + tfiles]
    self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
    configArguments['edmoutfiles'] = edmfiles
    configArguments['tfileoutfiles'] = tfiles
    configArguments['addoutputfiles'].extend(addoutputFiles)

    # Write out CMSSW config
    cmsswCfg.writeFile(cfgOutputName)

    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = [re.sub(r'^file:', '', file)
                      for file in getattr(self.config.JobType, 'inputFiles', [])]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        uploadResults = tb.upload()

    self.logger.debug("Result uploading input files: %s " % str(uploadResults))
    configArguments['cachefilename'] = uploadResults[1]
    configArguments['cacheurl'] = uploadResults[0]
    isbchecksum = uploadResults[2]

    # Upload list of user-defined input files to process as the primary input
    userFileName = getattr(self.config.Data, 'userInputFile', None)
    if userFileName:
        self.logger.debug("Attaching a list of user-specified primary input files from %s." % userFileName)
        fnames = []
        for fname in open(userFileName).readlines():
            fnames.append(fname.strip())
        configArguments['userfiles'] = filter(lambda x: x, fnames)  #removing whitelines and empty objects

    primDS = getattr(self.config.Data, 'primaryDataset', None)
    if primDS:
        # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar"
        primDS = "/" + os.path.join(*primDS.split("/"))
        if not re.match("/%(primDS)s.*" % lfnParts, primDS):
            self.logger.warning("Invalid primary dataset name %s for private MC; publishing may fail" % primDS)
        configArguments['inputdata'] = primDS
    else:
        configArguments['inputdata'] = getattr(self.config.Data, 'inputDataset', '/CRAB_UserFiles')

    lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % lumi_mask_name)
        lumi_list = getLumiList(lumi_mask_name, logger=self.logger)
    run_ranges = getattr(self.config.Data, 'runRange', None)
    # Accept comma-separated integers and inclusive ranges, e.g. '1,5-10'.
    run_ranges_is_valid = run_ranges is not None and isinstance(run_ranges, str) and re.match('^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges)
    if run_ranges_is_valid:
        run_list = getRunList(run_ranges)
        if lumi_list:
            lumi_list.selectRuns(run_list)
        else:
            if len(run_list) > 50000:
                msg = "Data.runRange includes %s runs." % str(len(run_list))
                msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                raise ConfigurationException(msg)
            lumi_list = LumiList(runs=run_list)
    if lumi_list:
        configArguments['runs'] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments['lumis'] = [str(reduce(lambda x, y: x + y, lumi_mask[run]))[1:-1].replace(' ', '')
                                    for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments, isbchecksum
def run(self, requestConfig):
    """
    Override run() for JobType.

    Builds the CMSSW config and sandbox tarball, uploads the sandbox and
    assembles the task configuration arguments (earliest variant: no
    'file:' prefix stripping, lumi mask read via getLumiMask).

    :returns: (tarFilename, configArguments, isbchecksum) tuple.
    """
    configArguments = {'addoutputfiles': [],
                       'adduserfiles': [],
                       'tfileoutfiles': [],
                       'edmoutfiles': [],
                      }

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({'jobarch': scram.scramArch,
                            'jobsw': scram.cmsswVersion,
                           })

    # Build tarball
    if self.workdir:
        tarFilename = os.path.join(self.workdir, PandaInterface.wrappedUuidGen() + 'default.tgz')
        cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py')
    else:
        # No working directory: fall back to temporary files.
        _dummy, tarFilename = tempfile.mkstemp(suffix='.tgz')
        _dummy, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')

    #configArguments['userisburl'] = 'https://'+ self.config.General.ufccacheUrl + '/crabcache/file?hashkey=' + uploadResults['hashkey']#XXX hardcoded
    #configArguments['userisburl'] = 'INSERTuserisburl'#XXX hardcoded
    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset
    # configArguments['ProcessingVersion'] = getattr(self.config.Data, 'processingVersion', None)

    # Create CMSSW config
    self.logger.debug("self.config: %s" % self.config)
    self.logger.debug("self.config.JobType.psetName: %s" % self.config.JobType.psetName)
    cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                           userConfig=self.config.JobType.psetName)

    # Interogate CMSSW config and user config for output file names, for now no use for edmFiles or TFiles here.
    analysisFiles, edmFiles = cmsswCfg.outputFiles()
    self.logger.debug("TFiles %s and EDM Files %s will be collected" % (analysisFiles, edmFiles))
    configArguments['tfileoutfiles'] = analysisFiles
    configArguments['edmoutfiles'] = edmFiles

    outputFiles = getattr(self.config.JobType, 'outputFiles', [])
    self.logger.debug("User files %s will be collected" % outputFiles)
    configArguments['addoutputfiles'].extend(outputFiles)

    # Write out CMSSW config
    cmsswCfg.writeFile(cfgOutputName)

    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = getattr(self.config.JobType, 'inputFiles', [])
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        uploadResults = tb.upload()

    self.logger.debug("Result uploading input files: %s " % str(uploadResults))
    configArguments['cachefilename'] = uploadResults[1]
    configArguments['cacheurl'] = uploadResults[0]
    isbchecksum = uploadResults[2]

    # Upload lumi mask if it exists
    lumiMaskName = getattr(self.config.Data, 'lumiMask', None)
    if lumiMaskName:
        self.logger.debug("Attaching lumi mask %s to the request" % lumiMaskName)
        lumiDict = getLumiMask(config=self.config, logger=self.logger)
        configArguments['runs'] = lumiDict.keys()
        #for each run we'll encode the lumis as a string representing a list of integers
        #[[1,2],[5,5]] ==> '1,2,5,5'
        configArguments['lumis'] = [str(reduce(lambda x,y: x+y, \
                                    lumiDict[run]))[1:-1].replace(' ','') \
                                    for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments, isbchecksum
class UserTarball(object):
    """
    _UserTarball_

    A subclass of TarFile for the user code tarballs. By default
    creates a new tarball with the user libraries from lib, module,
    and the data/ and interface/ sections of the src/ area.

    Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:bz2', config=None, logger=None, crabserver=None, s3tester=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        # dereference=True: archive the files that symlinks point to, not the links
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None
        self.content = None
        self.crabserver = crabserver
        self.s3tester = s3tester

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball: CMSSW build products,
        data/ and interface/ dirs under src/, user-listed extra files,
        the optional scriptExe, and the (pickled) pset files.
        """
        directories = ['lib', 'biglib', 'module']
        if getattr(self.config.JobType, 'sendPythonFolder', configParametersInfo['JobType.sendPythonFolder']['default']):
            directories.append('python')
            directories.append('cfipython')
        if getattr(self.config.JobType, 'sendExternalFolder', configParametersInfo['JobType.sendExternalFolder']['default']):
            externalDirPath = os.path.join(self.scram.getCmsswBase(), 'external')
            if os.path.exists(externalDirPath) and os.listdir(externalDirPath) != []:
                directories.append('external')
            else:
                self.logger.info("The config.JobType.sendExternalFolder parameter is set to True but the external directory "\
                                 "doesn't exist or is empty, not adding to tarball. Path: %s" % externalDirPath)

        # Note that dataDirs are only looked-for and added under the src/ folder.
        # /data/ subdirs contain data files needed by the code
        # /interface/ subdirs contain C++ header files needed e.g. by ROOT6
        dataDirs = ['data', 'interface']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug("Checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug("Adding directory %s to tarball" % fullPath)
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _, _ in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath, 'src')
                self.logger.debug("Adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException("The input file '%s' taken from parameter config.JobType.inputFiles cannot be found." % globName)
            for filename in fileNames:
                self.logger.debug("Adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)

        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        # Adding the pset files to the tarfile
        if cfgOutputName:
            basedir = os.path.dirname(cfgOutputName)
            self.tarfile.add(cfgOutputName, arcname=BOOTSTRAP_CFGFILE)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_PKL), arcname=BOOTSTRAP_CFGFILE_PKL)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_DUMP), arcname=BOOTSTRAP_CFGFILE_DUMP)

    def addMonFiles(self):
        """
        Add monitoring files the debug tarball.
        """
        configtmp = tempfile.NamedTemporaryFile(mode='w', delete=True)
        configtmp.write(str(self.config))
        configtmp.flush()
        psetfilename = getattr(self.config.JobType, 'psetName', None)
        if psetfilename:
            self.tarfile.add(psetfilename, '/debug/originalPSet.py')
        else:
            self.logger.debug('Failed to add pset to debug_files.tar.gz')

        self.tarfile.add(configtmp.name, '/debug/crabConfig.py')
        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))
        configtmp.close()

    def writeContent(self):
        """Save the content of the tarball"""
        self.content = [(int(x.size), x.name) for x in self.tarfile.getmembers()]

    def close(self):
        """
        Calculate the checksum and close
        """
        self.writeContent()
        return self.tarfile.close()

    def printSortedContent(self):
        """
        To be used for diagnostic printouts
        returns a string containing tarball content as a list of files sorted by size
        already formatted for use in a print statement
        """
        sortedContent = sorted(self.content, reverse=True)
        biggestFileSize = sortedContent[0][0]
        ndigits = int(math.ceil(math.log(biggestFileSize+1, 10)))
        contentList = "\nsandbox content sorted by size[Bytes]:"
        for (size, name) in sortedContent:
            contentList += ("\n%" + str(ndigits) + "s\t%s") % (size, name)
        return contentList

    def upload(self, filecacheurl=None):
        """
        Upload the tarball to the File Cache (S3 or the old UserFileCache),
        enforcing the sandbox size limit first.

        :param filecacheurl: cache endpoint; an 'S3' substring selects the S3 path.
        :returns: the hash key under which the sandbox is stored.
        :raises SandboxTooBigException: if the tarball exceeds FILE_SIZE_LIMIT.
        :raises CachefileNotFoundException: if the UFC upload did not return a hashkey.
        """
        self.close()
        archiveName = self.tarfile.name
        archiveSizeBytes = os.path.getsize(archiveName)
        # in python3 and python2 with __future__ division, double / means integer division
        archiveSizeKB = archiveSizeBytes//1024
        if archiveSizeKB <= 512:
            archiveSize = "%d KB" % archiveSizeKB
        elif archiveSizeKB < 1024*10:
            # in python3 and python2 with __future__ division, single / means floating point division
            # BUGFIX: was "%3f.1 MB", which printed a 6-decimal float plus a literal ".1"
            archiveSize = "%.1f MB" % (archiveSizeKB/1024)
        else:
            archiveSize = "%d MB" % (archiveSizeKB//1024)
        if archiveSizeBytes > FILE_SIZE_LIMIT:
            msg = ("%sError%s: input tarball size %s exceeds maximum allowed limit of %d MB" %
                   (colors.RED, colors.NORMAL, archiveSize, FILE_SIZE_LIMIT//1024//1024))
            msg += self.printSortedContent()
            raise SandboxTooBigException(msg)

        msg = ("Uploading archive %s (%s) to the CRAB cache. Using URI %s" %
               (archiveName, archiveSize, filecacheurl))
        self.logger.debug(msg)

        if 'S3' in filecacheurl.upper():
            # use S3
            # generate a 32char hash like UserFileCache used to do
            hashkey = calculateChecksum(archiveName, exclude=NEW_USER_SANDBOX_EXCLUSIONS)
            # the ".tar.gz" suffix here is forced by other places in the client which add it when
            # storing tarball name in task table. Not very elegant to need to hardcode in several places.
            cachename = "%s.tar.gz" % hashkey
            # current code requires a taskname to extract username. Any dummy one will do
            # next version of RESTCache will get username from cmsweb FE headers
            uploadToS3(crabserver=self.crabserver, objecttype='sandbox', filepath=archiveName,
                       tarballname=cachename, logger=self.logger)
        else:
            # old way using UFC
            ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint': filecacheurl, "pycurl": True})
            t1 = time.time()
            result = ufc.upload(archiveName, excludeList=NEW_USER_SANDBOX_EXCLUSIONS)
            ufcSeconds = int(time.time()-t1)
            if 'hashkey' not in result:
                self.logger.error("Failed to upload archive: %s" % str(result))
                raise CachefileNotFoundException
            hashkey = str(result['hashkey'])
            # upload a copy to S3 dev as well, just to stress it a bit, this never raises
            s3report = testS3upload(self.s3tester, archiveName, hashkey, self.logger)
            # report also how long it took uploading to UFC (which surely worked if we are here)
            s3report['ufcseconds'] = ufcSeconds
            # upload S3 test report to crabcache
            reportFile = '/tmp/crabs3report.' + uuid.uuid4().hex
            with open(reportFile, 'w') as fp:
                json.dump(s3report, fp)
            reportName = 'S3-' + s3report['timestamp'] + ':s3report.json'
            try:
                ufc.uploadLog(reportFile, reportName)
                self.logger.debug('Report of S3 upload stored on CrabCache as %s', reportName)
            except Exception as e:
                # best-effort: a failed report upload must not fail the submission
                self.logger.debug(str(e))
            os.remove(reportFile)

        return hashkey

    def checkdirectory(self, dir_):
        """Raise EnvironmentException if dir_ contains an infinite symlink loop."""
        #checking for infinite symbolic link loop
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        except OSError as msg:
            err = '%sError%s: Infinite directory loop found in: %s \nStderr: %s' % \
                    (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)

    def __getattr__(self, *args):
        """
        Pass any unknown functions or attribute requests on to the TarFile object
        """
        self.logger.debug("Passing getattr %s on to TarFile" % args)
        return self.tarfile.__getattribute__(*args)

    def __enter__(self):
        """
        Allow use as context manager
        """
        return self

    def __exit__(self, excType, excValue, excTrace):
        """
        Allow use as context manager
        """
        self.tarfile.close()
        if excType:
            return False
class UserTarball(object):
    """
    _UserTarball_

    A subclass of TarFile for the user code tarballs. By default
    creates a new tarball with the user libraries from lib, module,
    and the data/ and interface/ sections of the src/ area.

    Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        # dereference=True: archive the files that symlinks point to, not the links
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None
        # initialized here so reads before writeContent() don't fall through
        # __getattr__ to the TarFile object (consistent with the other variants)
        self.content = None

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball: CMSSW build products,
        data/ and interface/ dirs under src/, user-listed extra files,
        the scriptExe, the pset files and a /debug/ folder.
        """
        directories = ['lib', 'biglib', 'module']
        if getattr(self.config.JobType, 'sendPythonFolder', configParametersInfo['JobType.sendPythonFolder']['default']):
            directories.append('python')
        # /data/ subdirs contain data files needed by the code
        # /interface/ subdirs contain C++ header files needed e.g. by ROOT6
        dataDirs = ['data', 'interface']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug("Checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug("Adding directory %s to tarball" % fullPath)
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _dummy, _dummy in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath, 'src')
                self.logger.debug("Adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException("The input file '%s' taken from parameter config.JobType.inputFiles cannot be found." % globName)
            for filename in fileNames:
                self.logger.debug("Adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)

        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        # Adding the pset files to the tarfile
        if cfgOutputName:
            basedir = os.path.dirname(cfgOutputName)
            self.tarfile.add(cfgOutputName, arcname=BOOTSTRAP_CFGFILE)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_PKL), arcname=BOOTSTRAP_CFGFILE_PKL)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_DUMP), arcname=BOOTSTRAP_CFGFILE_DUMP)

        #debug directory
        # BUGFIX: open in text mode — str(self.config) is text and the
        # NamedTemporaryFile default mode is binary ('w+b')
        configtmp = tempfile.NamedTemporaryFile(mode='w', delete=True)
        configtmp.write(str(self.config))
        configtmp.flush()
        psetfilename = getattr(self.config.JobType, 'psetName', None)
        if psetfilename is not None:
            self.tarfile.add(psetfilename, '/debug/originalPSet.py')
        else:
            self.logger.debug('Failed to add pset to tarball')
        self.tarfile.add(configtmp.name, '/debug/crabConfig.py')
        configtmp.close()

    def writeContent(self):
        """Save the content of the tarball"""
        self.content = [(int(x.size), x.name) for x in self.tarfile.getmembers()]

    def close(self):
        """
        Calculate the checksum and close
        """
        self.writeContent()
        return self.tarfile.close()

    def upload(self, filecacheurl=None):
        """
        Upload the tarball to the File Cache.

        :returns: the hashkey string returned by the cache.
        :raises CachefileNotFoundException: if the upload reported no hashkey.
        """
        self.close()
        archiveName = self.tarfile.name
        self.logger.debug("Uploading archive %s to the CRAB cache. Using URI %s" % (archiveName, filecacheurl))
        ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint': filecacheurl, "pycurl": True})
        result = ufc.upload(archiveName, excludeList=USER_SANDBOX_EXCLUSIONS)
        if 'hashkey' not in result:
            self.logger.error("Failed to upload source files: %s" % str(result))
            raise CachefileNotFoundException
        return str(result['hashkey'])

    def checkdirectory(self, dir_):
        """Raise EnvironmentException if dir_ contains an infinite symlink loop."""
        #checking for infinite symbolic link loop
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        except OSError as msg:
            err = '%sError%s: Infinite directory loop found in: %s \nStderr: %s' % \
                    (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)

    def __getattr__(self, *args):
        """
        Pass any unknown functions or attribute requests on to the TarFile object
        """
        self.logger.debug("Passing getattr %s on to TarFile" % args)
        return self.tarfile.__getattribute__(*args)

    def __enter__(self):
        """
        Allow use as context manager
        """
        return self

    def __exit__(self, excType, excValue, excTrace):
        """
        Allow use as context manager
        """
        self.tarfile.close()
        if excType:
            return False
class UserTarball(object):
    """
    _UserTarball_

    A subclass of TarFile for the user code tarballs. By default
    creates a new tarball with the user libraries from lib, module,
    and the data/ and interface/ sections of the src/ area.

    Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:bz2', config=None, logger=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        # dereference=True: archive the files that symlinks point to, not the links
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None
        self.content = None

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball: CMSSW build products,
        data/ and interface/ dirs under src/, user-listed extra files,
        the optional scriptExe, and the (pickled) pset files.
        """
        directories = ['lib', 'biglib', 'module']
        if getattr(self.config.JobType, 'sendPythonFolder', configParametersInfo['JobType.sendPythonFolder']['default']):
            directories.append('python')
            directories.append('cfipython')
        if getattr(self.config.JobType, 'sendExternalFolder', configParametersInfo['JobType.sendExternalFolder']['default']):
            externalDirPath = os.path.join(self.scram.getCmsswBase(), 'external')
            if os.path.exists(externalDirPath) and os.listdir(externalDirPath) != []:
                directories.append('external')
            else:
                self.logger.info("The config.JobType.sendExternalFolder parameter is set to True but the external directory "\
                                 "doesn't exist or is empty, not adding to tarball. Path: %s" % externalDirPath)

        # Note that dataDirs are only looked-for and added under the src/ folder.
        # /data/ subdirs contain data files needed by the code
        # /interface/ subdirs contain C++ header files needed e.g. by ROOT6
        dataDirs = ['data', 'interface']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug("Checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug("Adding directory %s to tarball" % fullPath)
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _, _ in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath, 'src')
                self.logger.debug("Adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException("The input file '%s' taken from parameter config.JobType.inputFiles cannot be found." % globName)
            for filename in fileNames:
                self.logger.debug("Adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)

        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        # Adding the pset files to the tarfile
        if cfgOutputName:
            basedir = os.path.dirname(cfgOutputName)
            self.tarfile.add(cfgOutputName, arcname=BOOTSTRAP_CFGFILE)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_PKL), arcname=BOOTSTRAP_CFGFILE_PKL)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_DUMP), arcname=BOOTSTRAP_CFGFILE_DUMP)

    def addMonFiles(self):
        """
        Add monitoring files the debug tarball.
        """
        # BUGFIX: open in text mode — str(self.config) is text and the
        # NamedTemporaryFile default mode is binary ('w+b')
        configtmp = tempfile.NamedTemporaryFile(mode='w', delete=True)
        configtmp.write(str(self.config))
        configtmp.flush()
        psetfilename = getattr(self.config.JobType, 'psetName', None)
        if psetfilename is not None:
            self.tarfile.add(psetfilename, '/debug/originalPSet.py')
        else:
            self.logger.debug('Failed to add pset to debug_files.tar.gz')

        self.tarfile.add(configtmp.name, '/debug/crabConfig.py')
        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))
        configtmp.close()

    def writeContent(self):
        """Save the content of the tarball"""
        self.content = [(int(x.size), x.name) for x in self.tarfile.getmembers()]

    def close(self):
        """
        Calculate the checksum and close
        """
        self.writeContent()
        return self.tarfile.close()

    def printSortedContent(self):
        """
        To be used for diagnostic printouts
        returns a string containing tarball content as a list of files sorted by size
        already formatted for use in a print statement
        """
        sortedContent = sorted(self.content, reverse=True)
        biggestFileSize = sortedContent[0][0]
        ndigits = int(math.ceil(math.log(biggestFileSize+1, 10)))
        contentList = "\nsandbox content sorted by size[Bytes]:"
        for (size, name) in sortedContent:
            contentList += ("\n%" + str(ndigits) + "s\t%s") % (size, name)
        return contentList

    def upload(self, filecacheurl=None):
        """
        Upload the tarball to the File Cache after checking the size limit.

        :returns: the hashkey string returned by the cache.
        :raises SandboxTooBigException: if the tarball exceeds FILE_SIZE_LIMIT.
        :raises CachefileNotFoundException: if the upload reported no hashkey.
        """
        self.close()
        archiveName = self.tarfile.name
        archiveSizeBytes = os.path.getsize(archiveName)
        # in python3 and python2 with __future__ division, double / means integer division
        archiveSizeKB = archiveSizeBytes//1024
        if archiveSizeKB <= 512:
            archiveSize = "%d KB" % archiveSizeKB
        elif archiveSizeKB < 1024*10:
            # in python3 and python2 with __future__ division, single / means floating point division
            # BUGFIX: was "%3f.1 MB", which printed a 6-decimal float plus a literal ".1"
            archiveSize = "%.1f MB" % (archiveSizeKB/1024)
        else:
            archiveSize = "%d MB" % (archiveSizeKB//1024)
        if archiveSizeBytes > FILE_SIZE_LIMIT:
            msg = ("%sError%s: input tarball size %s exceeds maximum allowed limit of %d MB" %
                   (colors.RED, colors.NORMAL, archiveSize, FILE_SIZE_LIMIT//1024//1024))
            msg += self.printSortedContent()
            raise SandboxTooBigException(msg)

        msg = ("Uploading archive %s (%s) to the CRAB cache. Using URI %s" %
               (archiveName, archiveSize, filecacheurl))
        self.logger.debug(msg)

        ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint': filecacheurl, "pycurl": True})
        result = ufc.upload(archiveName, excludeList=NEW_USER_SANDBOX_EXCLUSIONS)
        if 'hashkey' not in result:
            self.logger.error("Failed to upload archive: %s" % str(result))
            raise CachefileNotFoundException
        return str(result['hashkey'])

    def checkdirectory(self, dir_):
        """Raise EnvironmentException if dir_ contains an infinite symlink loop."""
        #checking for infinite symbolic link loop
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        except OSError as msg:
            err = '%sError%s: Infinite directory loop found in: %s \nStderr: %s' % \
                    (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)

    def __getattr__(self, *args):
        """
        Pass any unknown functions or attribute requests on to the TarFile object
        """
        self.logger.debug("Passing getattr %s on to TarFile" % args)
        return self.tarfile.__getattribute__(*args)

    def __enter__(self):
        """
        Allow use as context manager
        """
        return self

    def __exit__(self, excType, excValue, excTrace):
        """
        Allow use as context manager
        """
        self.tarfile.close()
        if excType:
            return False
def checkAutomaticAvail(self, allowedSplitAlgos):
    """
    Append 'Automatic' to allowedSplitAlgos when the CMSSW release in the
    current SCRAM environment is 7.2 or newer (the splitting mode is only
    supported from that release on).
    """
    scram = ScramEnvironment(logger=self.logger)
    # Version strings look like 'CMSSW_X_Y_...': keep the X and Y fields.
    versionFields = scram.getCmsswVersion().split('_', 3)[1:-1]
    major, minor = (int(field) for field in versionFields)
    # Tuple comparison is equivalent to: major > 7 or (major == 7 and minor >= 2)
    if (major, minor) >= (7, 2):
        allowedSplitAlgos.append('Automatic')
def run(self, filecacheurl = None):
    """
    Override run() for JobType.

    Builds the Analysis task description: reads the SCRAM environment,
    writes the CMSSW config, creates and uploads the sandbox tarball,
    and collects output files, user input files and lumi-mask/run-range
    information into the request arguments.

    :param filecacheurl: endpoint of the cache the sandbox is uploaded to.
    :returns: tuple (tarFilename, configArguments, isbchecksum).
    :raises EnvironmentException: if a unique tarball name cannot be generated.
    :raises ConfigurationException: if runRange has too many runs without a lumiMask.
    """
    configArguments = {'addoutputfiles' : [],
                       'adduserfiles' : [],
                       'tfileoutfiles' : [],
                       'edmoutfiles' : [],
                      }

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)

    configArguments.update({'jobarch' : scram.scramArch,
                            'jobsw' : scram.cmsswVersion, })

    # Build tarball
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
        if len(tarUUID):
            tarFilename = os.path.join(self.workdir, tarUUID +'default.tgz')
            cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py')
        else:
            raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
    else:
        # No working directory: fall back to unique temporary files.
        _dummy, tarFilename = tempfile.mkstemp(suffix='.tgz')
        _dummy, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')

    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset
#        configArguments['ProcessingVersion'] = getattr(self.config.Data, 'processingVersion', None)

    # Create CMSSW config
    self.logger.debug("self.config: %s" % self.config)
    self.logger.debug("self.config.JobType.psetName: %s" % self.config.JobType.psetName)
    cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                           userConfig=self.config.JobType.psetName)

    ## Interrogate CMSSW config and user config for output file names. For now no use for EDM files or TFiles here.
    edmfiles, tfiles = cmsswCfg.outputFiles()
    # Strip the 'file:' prefix and drop user-listed outputs already detected as EDM/TFile outputs.
    addoutputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', []) if re.sub(r'^file:', '', file) not in edmfiles+tfiles]
    self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
    configArguments['edmoutfiles'] = edmfiles
    configArguments['tfileoutfiles'] = tfiles
    configArguments['addoutputfiles'].extend(addoutputFiles)

    # Write out CMSSW config
    cmsswCfg.writeFile(cfgOutputName)

    ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
    ## Since ScramEnvironment is already called above and the exception is not
    ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
    ## But otherwise we should take this into account.
    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'inputFiles', [])]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        uploadResults = tb.upload(filecacheurl = filecacheurl)

    self.logger.debug("Result uploading input files: %s " % str(uploadResults))
    # upload() returns (cachefilename, isbchecksum)
    configArguments['cacheurl'] = filecacheurl
    configArguments['cachefilename'] = uploadResults[0]
    isbchecksum = uploadResults[1]

    # Upload list of user-defined input files to process as the primary input
    userFilesList = getattr(self.config.Data, 'userInputFiles', None)
    if userFilesList:
        self.logger.debug("Attaching list of user-specified primary input files.")
        # NOTE(review): string.strip is the python2-only module function — confirm
        # this code path is python2-only or port to str.strip.
        userFilesList = map(string.strip, userFilesList)
        userFilesList = [file for file in userFilesList if file]
        if len(userFilesList) != len(set(userFilesList)):
            msg = "%sWarning%s: CRAB configuration parameter Data.userInputFiles contains duplicated entries." % (colors.RED, colors.NORMAL)
            msg += " Duplicated entries will be removed."
            self.logger.warning(msg)
        configArguments['userfiles'] = set(userFilesList)
        ## Get the user-specified primary dataset name.
        primaryDataset = getattr(self.config.Data, 'primaryDataset', 'CRAB_UserFiles')
        # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar"
        primaryDataset = "/" + os.path.join(*primaryDataset.split("/"))
        if not re.match("/%(primDS)s.*" % (lfnParts), primaryDataset):
            self.logger.warning("Invalid primary dataset name %s; publication may fail." % (primaryDataset))
        configArguments['inputdata'] = primaryDataset

    lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % lumi_mask_name)
        lumi_list = getLumiList(lumi_mask_name, logger = self.logger)
    run_ranges = getattr(self.config.Data, 'runRange', None)
    # Valid runRange: digits separated by ',' or '-', with no 'a-b-c' chains.
    # NOTE(review): the pattern relies on '\d' etc. surviving in a non-raw string;
    # consider a raw string literal to silence py3 escape warnings.
    run_ranges_is_valid = run_ranges is not None and isinstance(run_ranges, str) and re.match('^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges)
    if run_ranges_is_valid:
        run_list = getRunList(run_ranges)
        if lumi_list:
            # Restrict the lumi mask to the requested runs.
            lumi_list.selectRuns(run_list)
        else:
            if len(run_list) > 50000:
                msg = "Data.runRange includes %s runs." % str(len(run_list))
                msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                raise ConfigurationException(msg)
            lumi_list = LumiList(runs = run_list)
    if lumi_list:
        configArguments['runs'] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments['lumis'] = [str(reduce(lambda x,y: x+y, lumi_mask[run]))[1:-1].replace(' ','') for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments, isbchecksum
class UserTarball(object):
    """
    _UserTarball_

    A subclass of TarFile for the user code tarballs. By default
    creates a new tarball with the user libraries from lib, module,
    and the data/ sections of the src/ area.

    Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        # dereference=True: archive the files that symlinks point to, not the links
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball: CMSSW lib/module build
        products, data/ dirs found under src/, user-listed extra files,
        and the generated pset file (as PSet.py).
        """
        directories = ['lib', 'module']
        dataDirs = ['data']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug(" checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug(" adding directory %s to tarball" % fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _dummy, _dummy in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath, 'src')
                self.logger.debug(" adding data directory %s to tarball" % root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException('The input file "%s" taken from parameter config.JobType.inputFiles cannot be found' % globName)
            for filename in fileNames:
                self.logger.debug(" adding file %s to tarball" % filename)
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)

        # Adding the pset file to the tarfile
        if cfgOutputName:
            self.tarfile.add(cfgOutputName, arcname='PSet.py')
            currentPath = os.getcwd()  # NOTE(review): unused except by the commented-out code below
#            psetfile = getattr(self.config.JobType, 'psetName', None)
#            self.tarfile.add(os.path.join(currentPath, psetfile), arcname='PSet.py')

    def close(self):
        """
        Calculate the checksum and close the tarball
        """
        self.calculateChecksum()
        return self.tarfile.close()

    def upload(self):
        """
        Upload the tarball to the Panda Cache.

        :returns: tuple (serverUrl, archiveName, checksum).
        :raises CachefileNotFoundException: if putFile did not succeed.
        """
        self.close()
        archiveName = self.tarfile.name
        serverUrl = ""
        self.logger.debug(" uploading archive to cache %s " % archiveName)
        # reuseSandbox=True lets the server answer with an already-cached sandbox
        status, out = PandaInterface.putFile(archiveName, verbose=False, useCacheSrv=True, reuseSandbox=True)
        if out.startswith('NewFileName:'):
            # found the same input sandbox to reuse
            self.logger.debug("out: %s" % out)
            self.logger.debug("status: %s" % status)
            self.logger.debug("found the same input sandbox to reuse")
            archiveName = out.split(':')[-1]
            # NOTE(review): port 25443 is hardcoded here — confirm against server config
            serverUrl = "https://%s:%s" % (out.split(':')[-2], '25443')
            self.logger.debug("archiveName: %s" % archiveName)
        elif out.startswith('True'):
            # fresh upload: output encodes scheme:host:port:filename
            archiveName = out.split(':')[-1]
            serverUrl = "%s:%s:%s" % (out.split(':')[-4], out.split(':')[-3], out.split(':')[-2])
        else:
            self.logger.error(str(out))
            self.logger.error("failed to upload source files with %s" % status)
            raise CachefileNotFoundException

        return serverUrl, archiveName, self.checksum

    def calculateChecksum(self):
        """
        Calculate a checksum that doesn't depend on the tgz creation data
        (hashes member names/sizes/mtimes/owners instead of the file bytes).
        """
        lsl = [(x.name, int(x.size), int(x.mtime), x.uname) for x in self.tarfile.getmembers()]
        # NOTE(review): md5(str(...)) works on python2 only; python3 needs bytes — confirm target runtime
        hasher = hashlib.md5(str(lsl))
        self.logger.debug('tgz contents: %s' % lsl)
        self.checksum = hasher.hexdigest()
        self.logger.debug('MD5 checksum: %s' % self.checksum)

        #Old way reads in the file again. May use for for non-tar files if needed.
        #sha256sum = hashlib.sha256()
        #with open(self.tarfile.name, 'rb') as f:
            #while True:
                #chunkdata = f.read(8192)
                #if not chunkdata:
                    #break
                #sha256sum.update(chunkdata)
        #sha256sum.hexdigest()

    def __getattr__(self, *args):
        """
        Pass any unknown functions or attribute requests on to the TarFile object
        """
        self.logger.debug("Passing getattr %s on to TarFile" % args)
        return self.tarfile.__getattribute__(*args)

    def __enter__(self):
        """
        Allow use as context manager
        """
        return self

    def __exit__(self, excType, excValue, excTrace):
        """
        Allow use as context manager
        """
        self.tarfile.close()
        if excType:
            return False
def run(self, filecacheurl=None):
    """
    Override run() for JobType.

    Builds the CMSSW user sandbox tarball, uploads it to the file cache at
    `filecacheurl`, and assembles the task configuration dictionary.

    Returns a 3-tuple (tarFilename, configArguments, isbchecksum).
    Raises EnvironmentException / ConfigurationException on setup or
    configuration errors.
    """
    configArguments = {"addoutputfiles": [],
                       "adduserfiles": [],
                       "tfileoutfiles": [],
                       "edmoutfiles": []}

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)

    configArguments.update({"jobarch": scram.getScramArch(),
                            "jobsw": scram.getCmsswVersion()})

    # Build tarball: in the working directory when one exists, otherwise in
    # unique temporary files.
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug("UNIQUE NAME: tarUUID %s " % tarUUID)
        if len(tarUUID):
            tarFilename = os.path.join(self.workdir, tarUUID + "default.tgz")
            cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE)
        else:
            raise EnvironmentException("Problem with uuidgen while preparing for Sandbox upload.")
    else:
        _dummy, tarFilename = tempfile.mkstemp(suffix=".tgz")
        _dummy, cfgOutputName = tempfile.mkstemp(suffix="_cfg.py")

    if getattr(self.config.Data, "inputDataset", None):
        configArguments["inputdata"] = self.config.Data.inputDataset

    ## Create CMSSW config.
    self.logger.debug("self.config: %s" % (self.config))
    self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName))
    ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent
    ## in the sense that a second loading of the same pset may not produce the same
    ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW
    ## pset twice. However, some "complicated" psets seem to evade the caching.
    ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that
    ## it can be reused later if wanted (for example, in PrivateMC when checking if
    ## the pset has an LHE source) instead of having to load the pset again.
    ## As for what does "complicated" psets mean, Daniel Riley said that there are
    ## some psets where one module modifies the configuration from another module.
    self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                                userConfig=self.config.JobType.psetName)

    ## If there is a CMSSW pset, do a basic validation of it.
    if not bootstrapDone() and self.config.JobType.psetName:
        valid, msg = self.cmsswCfg.validateConfig()
        if not valid:
            raise ConfigurationException(msg)

    ## We need to put the pickled CMSSW configuration in the right place.
    ## Here, we determine if the bootstrap script already run and prepared everything
    ## for us. In such case we move the file, otherwise we pickle.dump the pset
    if not bootstrapDone():
        # Write out CMSSW config
        self.cmsswCfg.writeFile(cfgOutputName)
    else:
        # Move the pickled configuration file created by the bootstrap script
        self.moveCfgFile(cfgOutputName)

    ## Interrogate the CMSSW pset for output files (only output files produced by
    ## PoolOutputModule or TFileService are identified automatically). Do this
    ## automatic detection even if JobType.disableAutomaticOutputCollection = True,
    ## so that we can still classify the output files in EDM, TFile and additional
    ## output files in the Task DB (and the job ad).
    ## TODO: Do we really need this classification at all? cmscp and PostJob read
    ## the FJR to know if an output file is EDM, TFile or other.
    edmfiles, tfiles = self.cmsswCfg.outputFiles()
    ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile
    ## output files that are not listed in JobType.outputFiles.
    if getattr(self.config.JobType, "disableAutomaticOutputCollection",
               getParamDefaultValue("JobType.disableAutomaticOutputCollection")):
        outputFiles = [re.sub(r"^file:", "", file) for file in getattr(self.config.JobType, "outputFiles", [])]
        edmfiles = [file for file in edmfiles if file in outputFiles]
        tfiles = [file for file in tfiles if file in outputFiles]
    ## Get the list of additional output files that have to be collected as given
    ## in JobType.outputFiles, but remove duplicates listed already as EDM files or
    ## TFiles.
    addoutputFiles = [re.sub(r"^file:", "", file)
                      for file in getattr(self.config.JobType, "outputFiles", [])
                      if re.sub(r"^file:", "", file) not in edmfiles + tfiles]
    self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
    configArguments["edmoutfiles"] = edmfiles
    configArguments["tfileoutfiles"] = tfiles
    configArguments["addoutputfiles"].extend(addoutputFiles)
    ## Give warning message in case no output file was detected in the CMSSW pset
    ## nor was any specified in the CRAB configuration.
    if (not configArguments["edmoutfiles"] and not configArguments["tfileoutfiles"]
            and not configArguments["addoutputfiles"]):
        msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
        if getattr(self.config.JobType, "disableAutomaticOutputCollection",
                   getParamDefaultValue("JobType.disableAutomaticOutputCollection")):
            msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration"
            msg += " and no output file was explicitly specified in the CRAB configuration."
        else:
            msg += " CRAB could not detect any output file in the CMSSW configuration"
            msg += " nor was any explicitly specified in the CRAB configuration."
        msg += " Hence CRAB will not collect any output file from this task."
        self.logger.warning(msg)

    ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
    ## Since ScramEnvironment is already called above and the exception is not
    ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
    ## But otherwise we should take this into account.
    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = [re.sub(r"^file:", "", file) for file in getattr(self.config.JobType, "inputFiles", [])]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments["adduserfiles"] = [os.path.basename(f) for f in inputFiles]
        uploadResults = tb.upload(filecacheurl=filecacheurl)
    self.logger.debug("Result uploading input files: %s " % str(uploadResults))
    configArguments["cacheurl"] = filecacheurl
    configArguments["cachefilename"] = uploadResults[0]
    isbchecksum = uploadResults[1]

    # Upload list of user-defined input files to process as the primary input
    userFilesList = getattr(self.config.Data, "userInputFiles", None)
    if userFilesList:
        self.logger.debug("Attaching list of user-specified primary input files.")
        # NOTE(review): python2 idiom -- under python3 map() is lazy and
        # string.strip no longer exists; verify before any py3 migration.
        userFilesList = map(string.strip, userFilesList)
        userFilesList = [file for file in userFilesList if file]
        if len(userFilesList) != len(set(userFilesList)):
            msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
            msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries."
            msg += " Duplicated entries will be removed."
            self.logger.warning(msg)
        configArguments["userfiles"] = set(userFilesList)
        ## Get the user-specified primary dataset name.
        primaryDataset = getattr(self.config.Data, "primaryDataset", "CRAB_UserFiles")
        # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar"
        primaryDataset = "/" + os.path.join(*primaryDataset.split("/"))
        if not re.match("/%(primDS)s.*" % (lfnParts), primaryDataset):
            self.logger.warning("Invalid primary dataset name %s; publication may fail."
                                % (primaryDataset))
        configArguments["inputdata"] = primaryDataset

    lumi_mask_name = getattr(self.config.Data, "lumiMask", None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name))
        try:
            lumi_list = getLumiList(lumi_mask_name, logger=self.logger)
        except ValueError as ex:
            msg = "%sError%s:" % (colors.RED, colors.NORMAL)
            msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex)
            raise ConfigurationException(msg)
    run_ranges = getattr(self.config.Data, "runRange", None)
    if run_ranges:
        # Accepts comma-separated integers and inclusive ranges, e.g. '12345,99900-99910'
        run_ranges_is_valid = re.match("^\d+((?!(-\d+-))(\,|\-)\d+)*$", run_ranges)
        if run_ranges_is_valid:
            run_list = getRunList(run_ranges)
            if lumi_list:
                lumi_list.selectRuns(run_list)
                if not lumi_list:
                    msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null."
                    raise ConfigurationException(msg)
            else:
                if len(run_list) > 50000:
                    msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list))
                    msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                    raise ConfigurationException(msg)
                lumi_list = LumiList(runs=run_list)
        else:
            msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. Example: '12345,99900-99910'"
            raise ConfigurationException(msg)
    if lumi_list:
        configArguments["runs"] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments["lumis"] = [str(reduce(lambda x, y: x + y, lumi_mask[run]))[1:-1].replace(" ", "")
                                    for run in configArguments["runs"]]

    configArguments["jobtype"] = "Analysis"

    return tarFilename, configArguments, isbchecksum
class UserTarball(object):
    """
    _UserTarball_

        A subclass of TarFile for the user code tarballs. By default
        creates a new tarball with the user libraries from lib, module,
        and the data/ sections of the src/ area.

        Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        """Open (create) the tarball `name` and record the SCRAM environment."""
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        # dereference=True: follow symlinks so the sandbox contains real files
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball: CMSSW project directories,
        data/ directories under src/, user files from `userFiles` globs, the
        optional scriptExe, and the (pickled) pset `cfgOutputName`.
        """
        directories = ['lib', 'module']
        if getattr(self.config.JobType, 'sendPythonFolder',
                   configParametersInfo['JobType.sendPythonFolder']['default']):
            directories.append('python')
        dataDirs = ['data']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug(" checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug(" adding directory %s to tarball" % fullPath)
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _dummy, _dummy in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath, 'src')
                self.logger.debug(" adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException(
                    'The input file "%s" taken from parameter config.JobType.inputFiles cannot be found' % globName)
            for filename in fileNames:
                self.logger.debug(" adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)

        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        # Adding the pset and crabconfig file to the tarfile
        if cfgOutputName:
            self.tarfile.add(cfgOutputName, arcname='PSet.py')
            self.tarfile.add(os.path.splitext(cfgOutputName)[0] + '.pkl', arcname='PSet.pkl')

        # Ship a copy of the CRAB configuration and of the original pset for debugging
        configtmp = tempfile.NamedTemporaryFile(delete=True)
        configtmp.write(str(self.config))
        configtmp.flush()
        psetfilename = getattr(self.config.JobType, 'psetName', None)
        # FIX: idiomatic identity test (was "if not psetfilename == None:")
        if psetfilename is not None:
            self.tarfile.add(psetfilename, '/debug/originalPSet.py')
        else:
            self.logger.debug('Failed to add pset to tarball')
        self.tarfile.add(configtmp.name, '/debug/crabConfig.py')
        configtmp.close()

    def close(self):
        """
        Calculate the checksum and close the underlying tarfile.
        """
        self.calculateChecksum()
        return self.tarfile.close()

    def upload(self, filecacheurl=None):
        """
        Upload the tarball to the File Cache.

        Returns (cache file name, checksum).
        Raises CachefileNotFoundException when the cache returns no hashkey.
        """
        self.close()
        archiveName = self.tarfile.name
        self.logger.debug(" uploading archive to cache %s " % archiveName)
        ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint': filecacheurl})
        result = ufc.upload(archiveName)
        if 'hashkey' not in result:
            self.logger.error("Failed to upload source files: %s" % str(result))
            raise CachefileNotFoundException
        return str(result['hashkey']) + '.tar.gz', self.checksum

    def calculateChecksum(self):
        """
        Calculate a checksum that doesn't depend on the tgz creation data.
        """
        # Digest over (name, size, mtime, owner) of every member so the value
        # is stable across re-creations of an identical tarball (the .tgz byte
        # stream itself embeds a creation timestamp).
        lsl = [(x.name, int(x.size), int(x.mtime), x.uname) for x in self.tarfile.getmembers()]
        hasher = hashlib.md5(str(lsl))
        self.logger.debug('tgz contents: %s' % lsl)
        self.checksum = hasher.hexdigest()
        self.logger.debug('MD5 checksum: %s' % self.checksum)
        # Old way reads in the file again. May use for non-tar files if needed.
        #sha256sum = hashlib.sha256()
        #with open(self.tarfile.name, 'rb') as f:
        #    while True:
        #        chunkdata = f.read(8192)
        #        if not chunkdata:
        #            break
        #        sha256sum.update(chunkdata)
        #sha256sum.hexdigest()

    def checkdirectory(self, dir_):
        """Walk `dir_` (following symlinks) and fail on infinite symlink loops."""
        # checking for infinite symbolic link loop
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        # FIX: "except ... as ..." instead of py2-only "except OSError, msg",
        # consistent with the other exception handlers in this file.
        except OSError as msg:
            err = '%sError %s:Infinite directory loop found in: %s \nStderr: %s' % \
                  (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)
def run(self, filecacheurl=None):
    """
    Override run() for JobType.

    Builds the CMSSW user sandbox tarball plus a debug-files tarball, uploads
    both to the file cache at `filecacheurl`, and assembles the task
    configuration dictionary.

    Returns a 2-tuple (tarFilename, configArguments).
    Raises ConfigurationException / EnvironmentException / ClientException on
    configuration, environment or upload errors.
    """
    configArguments = {'addoutputfiles': [],
                       'adduserfiles': [],
                       'tfileoutfiles': [],
                       'edmoutfiles': [],
                      }

    if getattr(self.config.Data, 'useParent', False) and getattr(self.config.Data, 'secondaryInputDataset', None):
        msg = "Invalid CRAB configuration: Parameters Data.useParent and Data.secondaryInputDataset cannot be used together."
        raise ConfigurationException(msg)

    # Get SCRAM environment
    scram = ScramEnvironment(logger=self.logger)
    configArguments.update({'jobarch': scram.getScramArch(),
                            'jobsw': scram.getCmsswVersion()})

    # Build tarball: in the working directory when one exists, otherwise in
    # unique temporary files.
    if self.workdir:
        tarUUID = PandaInterface.wrappedUuidGen()
        self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
        if len(tarUUID):
            tarFilename = os.path.join(self.workdir, tarUUID + 'default.tgz')
            debugTarFilename = os.path.join(self.workdir, 'debugFiles.tgz')
            cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE)
        else:
            raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
    else:
        _, tarFilename = tempfile.mkstemp(suffix='.tgz')
        # BUGFIX: debugTarFilename was only defined in the workdir branch,
        # causing a NameError below when self.workdir is not set.
        _, debugTarFilename = tempfile.mkstemp(suffix='.tgz')
        _, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')

    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset

    ## Create CMSSW config.
    self.logger.debug("self.config: %s" % (self.config))
    self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName))
    ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent
    ## in the sense that a second loading of the same pset may not produce the same
    ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW
    ## pset twice. However, some "complicated" psets seem to evade the caching.
    ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that
    ## it can be reused later if wanted (for example, in PrivateMC when checking if
    ## the pset has an LHE source) instead of having to load the pset again.
    ## As for what does "complicated" psets mean, Daniel Riley said that there are
    ## some psets where one module modifies the configuration from another module.
    self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                                userConfig=self.config.JobType.psetName)

    ## If there is a CMSSW pset, do a basic validation of it.
    if not bootstrapDone() and self.config.JobType.psetName:
        valid, msg = self.cmsswCfg.validateConfig()
        if not valid:
            raise ConfigurationException(msg)

    ## We need to put the pickled CMSSW configuration in the right place.
    ## Here, we determine if the bootstrap script already run and prepared everything
    ## for us. In such case we move the file, otherwise we pickle.dump the pset
    if not bootstrapDone():
        # Write out CMSSW config
        self.cmsswCfg.writeFile(cfgOutputName)
    else:
        # Move the pickled and the configuration files created by the bootstrap script
        self.moveCfgFile(cfgOutputName)

    ## Interrogate the CMSSW pset for output files (only output files produced by
    ## PoolOutputModule or TFileService are identified automatically). Do this
    ## automatic detection even if JobType.disableAutomaticOutputCollection = True,
    ## so that we can still classify the output files in EDM, TFile and additional
    ## output files in the Task DB (and the job ad).
    ## TODO: Do we really need this classification at all? cmscp and PostJob read
    ## the FJR to know if an output file is EDM, TFile or other.
    edmfiles, tfiles = self.cmsswCfg.outputFiles()
    ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile
    ## output files that are not listed in JobType.outputFiles.
    if getattr(self.config.JobType, 'disableAutomaticOutputCollection',
               getParamDefaultValue('JobType.disableAutomaticOutputCollection')):
        outputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', [])]
        edmfiles = [file for file in edmfiles if file in outputFiles]
        tfiles = [file for file in tfiles if file in outputFiles]
    ## Get the list of additional output files that have to be collected as given
    ## in JobType.outputFiles, but remove duplicates listed already as EDM files or
    ## TFiles.
    addoutputFiles = [re.sub(r'^file:', '', file)
                      for file in getattr(self.config.JobType, 'outputFiles', [])
                      if re.sub(r'^file:', '', file) not in edmfiles + tfiles]
    self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
    self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
    self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
    configArguments['edmoutfiles'] = edmfiles
    configArguments['tfileoutfiles'] = tfiles
    configArguments['addoutputfiles'].extend(addoutputFiles)
    ## Give warning message in case no output file was detected in the CMSSW pset
    ## nor was any specified in the CRAB configuration.
    if not configArguments['edmoutfiles'] and not configArguments['tfileoutfiles'] and not configArguments['addoutputfiles']:
        msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
        if getattr(self.config.JobType, 'disableAutomaticOutputCollection',
                   getParamDefaultValue('JobType.disableAutomaticOutputCollection')):
            msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration"
            msg += " and no output file was explicitly specified in the CRAB configuration."
        else:
            msg += " CRAB could not detect any output file in the CMSSW configuration"
            msg += " nor was any explicitly specified in the CRAB configuration."
        msg += " Hence CRAB will not collect any output file from this task."
        self.logger.warning(msg)

    ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
    ## Since ScramEnvironment is already called above and the exception is not
    ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
    ## But otherwise we should take this into account.
    with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
        inputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'inputFiles', [])]
        tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
        configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
        try:
            uploadResult = tb.upload(filecacheurl=filecacheurl)
        except HTTPException as hte:
            # Translate the server's "file too big" reply into a readable
            # report that lists the sandbox content sorted by size.
            if 'X-Error-Info' in hte.headers:
                reason = hte.headers['X-Error-Info']
                reason_re = re.compile(r'\AFile size is ([0-9]*)B\. This is bigger than the maximum allowed size of ([0-9]*)B\.$')
                re_match = reason_re.match(reason)
                if re_match:
                    ISBSize = int(re_match.group(1))
                    ISBSizeLimit = int(re_match.group(2))
                    reason = "%sError%s:" % (colors.RED, colors.NORMAL)
                    reason += " Input sanbox size is ~%sMB. This is bigger than the maximum allowed size of %sMB." \
                              % (ISBSize/1024/1024, ISBSizeLimit/1024/1024)
                    # NOTE(review): tb.content is presumably a list of
                    # (size, name) pairs filled by addFiles -- confirm.
                    ISBContent = sorted(tb.content, reverse=True)
                    biggestFileSize = ISBContent[0][0]
                    ndigits = int(math.ceil(math.log(biggestFileSize + 1, 10)))
                    reason += "\nInput sanbox content sorted by size[Bytes]:"
                    for (size, name) in ISBContent:
                        reason += ("\n%" + str(ndigits) + "s\t%s") % (size, name)
                    raise ClientException(reason)
            raise hte
        except Exception as e:
            msg = ("Impossible to calculate the checksum of the sandbox tarball.\nError message: %s.\n"
                   "More details can be found in %s" % (e, self.logger.logfile))
            LOGGERS['CRAB3'].exception(msg) #the traceback is only printed into the logfile
            raise ClientException(msg)

    # Build and upload the debug-files tarball (best effort: failures are
    # logged but do not abort the submission).
    debugFilesUploadResult = None
    with UserTarball(name=debugTarFilename, logger=self.logger, config=self.config) as dtb:
        dtb.addMonFiles()
        try:
            debugFilesUploadResult = dtb.upload(filecacheurl=filecacheurl)
        except Exception as e:
            msg = ("Problem uploading debug_files.tar.gz.\nError message: %s.\n"
                   "More details can be found in %s" % (e, self.logger.logfile))
            LOGGERS['CRAB3'].exception(msg) #the traceback is only printed into the logfile

    configArguments['cacheurl'] = filecacheurl
    configArguments['cachefilename'] = "%s.tar.gz" % uploadResult
    if debugFilesUploadResult is not None:
        configArguments['debugfilename'] = "%s.tar.gz" % debugFilesUploadResult
    self.logger.debug("Result uploading input files: %(cachefilename)s " % configArguments)

    # Upload list of user-defined input files to process as the primary input
    userFilesList = getattr(self.config.Data, 'userInputFiles', None)
    if userFilesList:
        self.logger.debug("Attaching list of user-specified primary input files.")
        # NOTE(review): python2 idiom -- under python3 map() is lazy and
        # string.strip no longer exists; verify before any py3 migration.
        userFilesList = map(string.strip, userFilesList)
        userFilesList = [file for file in userFilesList if file]
        if len(userFilesList) != len(set(userFilesList)):
            msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
            msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries."
            msg += " Duplicated entries will be removed."
            self.logger.warning(msg)
        configArguments['userfiles'] = set(userFilesList)
        configArguments['primarydataset'] = getattr(self.config.Data, 'outputPrimaryDataset', 'CRAB_UserFiles')

    lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name))
        try:
            lumi_list = getLumiList(lumi_mask_name, logger=self.logger)
        except ValueError as ex:
            msg = "%sError%s:" % (colors.RED, colors.NORMAL)
            msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex)
            raise ConfigurationException(msg)
    run_ranges = getattr(self.config.Data, 'runRange', None)
    if run_ranges:
        # Accepts comma-separated integers and inclusive ranges, e.g. '12345,99900-99910'
        run_ranges_is_valid = re.match('^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges)
        if run_ranges_is_valid:
            run_list = getRunList(run_ranges)
            if lumi_list:
                lumi_list.selectRuns(run_list)
                if not lumi_list:
                    msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null."
                    raise ConfigurationException(msg)
            else:
                if len(run_list) > 50000:
                    msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list))
                    msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                    raise ConfigurationException(msg)
                lumi_list = LumiList(runs=run_list)
        else:
            msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. Example: '12345,99900-99910'"
            raise ConfigurationException(msg)
    if lumi_list:
        configArguments['runs'] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments['lumis'] = [str(reduce(lambda x, y: x + y, lumi_mask[run]))[1:-1].replace(' ', '')
                                    for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return tarFilename, configArguments
class UserTarball(object):
    """
    _UserTarball_

        A subclass of TarFile for the user code tarballs. By default
        creates a new tarball with the user libraries from lib, module,
        and the data/ sections of the src/ area.

        Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        """Open (create) the tarball `name` and record the SCRAM environment."""
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        # dereference=True: follow symlinks so the sandbox contains real files
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None
        # Route Panda tracebacks to a dedicated logger
        PandaInterface.LOGGER = logging.getLogger('CRAB3:traceback')

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball: CMSSW project directories,
        data/ directories under src/, user files from `userFiles` globs, and
        the pset `cfgOutputName`.
        """
        directories = ['lib', 'module']
        dataDirs = ['data']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug(" checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug(" adding directory %s to tarball" % fullPath)
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _dummy, _dummy in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath, 'src')
                self.logger.debug(" adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException(
                    'The input file "%s" taken from parameter config.JobType.inputFiles cannot be found' % globName)
            for filename in fileNames:
                self.logger.debug(" adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)

        # Adding the pset file to the tarfile
        if cfgOutputName:
            self.tarfile.add(cfgOutputName, arcname='PSet.py')
        # FIX: dropped the unused local "currentPath = os.getcwd()" and the
        # dead commented-out manual PSet addition that referenced it.

    def close(self):
        """
        Calculate the checksum and close the underlying tarfile.
        """
        self.calculateChecksum()
        return self.tarfile.close()

    def upload(self):
        """
        Upload the tarball to the File Cache.

        Returns (filecacheurl, cache file name, checksum).
        Raises CachefileNotFoundException when the cache returns no hashkey.
        """
        self.close()
        archiveName = self.tarfile.name
        self.logger.debug(" uploading archive to cache %s " % archiveName)
        ufc = UserFileCache({'endpoint': self.config.JobType.filecacheurl})
        result = ufc.upload(archiveName)
        if 'hashkey' not in result:
            self.logger.error("Failed to upload source files: %s" % str(result))
            raise CachefileNotFoundException
        return self.config.JobType.filecacheurl, str(result['hashkey']) + '.tar.gz', self.checksum

    def calculateChecksum(self):
        """
        Calculate a checksum that doesn't depend on the tgz creation data.
        """
        # Digest over (name, size, mtime, owner) of every member so the value
        # is stable across re-creations of an identical tarball (the .tgz byte
        # stream itself embeds a creation timestamp).
        lsl = [(x.name, int(x.size), int(x.mtime), x.uname) for x in self.tarfile.getmembers()]
        hasher = hashlib.md5(str(lsl))
        self.logger.debug('tgz contents: %s' % lsl)
        self.checksum = hasher.hexdigest()
        self.logger.debug('MD5 checksum: %s' % self.checksum)
        # Old way reads in the file again. May use for non-tar files if needed.
        #sha256sum = hashlib.sha256()
        #with open(self.tarfile.name, 'rb') as f:
        #    while True:
        #        chunkdata = f.read(8192)
        #        if not chunkdata:
        #            break
        #        sha256sum.update(chunkdata)
        #sha256sum.hexdigest()

    def checkdirectory(self, dir_):
        """Walk `dir_` (following symlinks) and fail on infinite symlink loops."""
        # checking for infinite symbolic link loop
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        # FIX: "except ... as ..." (was py2-only "except OSError , msg") and
        # renamed the locals that shadowed the builtins dir/file, matching the
        # newer UserTarball.checkdirectory implementation in this file.
        except OSError as msg:
            err = '%sError %s:Infinite directory loop found in: %s \nStderr: %s' % \
                  (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)