Example #1
    def checkAutomaticAvail(self, allowedSplitAlgos):
        scram = ScramEnvironment(logger=self.logger)
        major, minor = [
            int(v) for v in scram.getCmsswVersion().split('_', 3)[1:-1]
        ]
        if major > 7 or (major == 7 and minor >= 2):
            allowedSplitAlgos.append('Automatic')
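
The version check above relies on SCRAM version strings having the shape CMSSW_<major>_<minor>_<rest>; split('_', 3) caps the number of splits so that suffixes such as _pre5 stay inside the last piece. A standalone sketch of the same parsing (the sample version strings are only illustrations):

def parseCmsswVersion(version):
    # 'CMSSW_10_6_30'.split('_', 3) -> ['CMSSW', '10', '6', '30'];
    # slicing [1:-1] keeps just the major and minor components.
    major, minor = [int(v) for v in version.split('_', 3)[1:-1]]
    return major, minor

assert parseCmsswVersion('CMSSW_10_6_30') == (10, 6)
assert parseCmsswVersion('CMSSW_7_2_0_pre5') == (7, 2)
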
Example #2
    def testInit(self):
        """
        Test constructor
        """

        scram = ScramEnvironment(logger=self.testLogger)
        scram.getCmsswVersion()
Example #3
    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None
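
Note the dereference=True: symbolic links are stored as the files they point to rather than as links, so the unpacked sandbox stays self-contained even when the work area links into shared software directories. A minimal illustration (the file names sandbox.tgz and payload.txt are hypothetical):

import tarfile

# dereference=True makes add() follow symlinks and store real file contents.
with tarfile.open(name='sandbox.tgz', mode='w:gz', dereference=True) as tar:
    tar.add('payload.txt')  # hypothetical input; a symlink here would be resolved
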
Example #4
    def testAccessors(self):
        """
        Test various accessors
        """

        scram = ScramEnvironment(logger=self.testLogger)

        self.assertEqual(scram.getCmsswVersion(), self.version)
        self.assertEqual(scram.getScramArch(),    self.arch)
        self.assertEqual(scram.getCmsswBase(),    self.base)
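
These accessors correspond to the standard SCRAM shell variables. A minimal test double, assuming (as the tests suggest) that ScramEnvironment reads CMSSW_VERSION, SCRAM_ARCH and CMSSW_BASE from the process environment:

import os

class FakeScramEnvironment(object):
    # Stand-in for tests: answers the accessors from environment variables.
    def getCmsswVersion(self):
        return os.environ.get('CMSSW_VERSION')

    def getScramArch(self):
        return os.environ.get('SCRAM_ARCH')

    def getCmsswBase(self):
        return os.environ.get('CMSSW_BASE')
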
Example #5
    def testScram(self):
        """
        Test Scram environment
        """

        msg = "You must set up a CMSSW environment first"
        scram = ScramEnvironment(logger=self.logger)
        self.assertNotEqual(scram.getCmsswVersion(), None, msg)
        self.assertNotEqual(scram.getScramArch(), None, msg)
        self.assertNotEqual(scram.getCmsswBase(), None, msg)
Example #6
    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None
        PandaInterface.LOGGER = logging.getLogger('CRAB3:traceback')
Example #7
class UserTarball(object):
    """
        _UserTarball_

            A subclass of TarFile for the user code tarballs. By default
            creates a new tarball with the user libraries from lib, module,
            and the data/ and interface/ sections of the src/ area.

            Also adds user specified files in the right place.
    """
    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball
        """
        directories = ['lib', 'biglib', 'module']
        if getattr(
                self.config.JobType, 'sendPythonFolder',
                configParametersInfo['JobType.sendPythonFolder']['default']):
            directories.append('python')
        # /data/ subdirs contain data files needed by the code
        # /interface/ subdirs contain C++ header files needed e.g. by ROOT6
        dataDirs = ['data', 'interface']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug("Checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug("Adding directory %s to tarball" % fullPath)
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _, _ in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath, 'src')
                self.logger.debug("Adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException(
                    "The input file '%s' taken from parameter config.JobType.inputFiles cannot be found."
                    % globName)
            for filename in fileNames:
                self.logger.debug("Adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                self.tarfile.add(filename,
                                 os.path.basename(filename),
                                 recursive=True)

        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        # Adding the pset files to the tarfile
        if cfgOutputName:
            basedir = os.path.dirname(cfgOutputName)
            self.tarfile.add(cfgOutputName, arcname=BOOTSTRAP_CFGFILE)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_PKL),
                             arcname=BOOTSTRAP_CFGFILE_PKL)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_DUMP),
                             arcname=BOOTSTRAP_CFGFILE_DUMP)

        #debug directory
        configtmp = tempfile.NamedTemporaryFile(delete=True)
        configtmp.write(str(self.config))
        configtmp.flush()
        psetfilename = getattr(self.config.JobType, 'psetName', None)
        if psetfilename is not None:
            self.tarfile.add(psetfilename, '/debug/originalPSet.py')
        else:
            self.logger.debug('Failed to add pset to tarball')
        self.tarfile.add(configtmp.name, '/debug/crabConfig.py')
        configtmp.close()

    def writeContent(self):
        """Save the content of the tarball"""
        self.content = [(int(x.size), x.name)
                        for x in self.tarfile.getmembers()]

    def close(self):
        """
        Calculate the checksum and close
        """
        self.writeContent()
        return self.tarfile.close()

    def upload(self, filecacheurl=None):
        """
        Upload the tarball to the File Cache
        """
        self.close()
        archiveName = self.tarfile.name
        self.logger.debug(
            "Uploading archive %s to the CRAB cache. Using URI %s" %
            (archiveName, filecacheurl))
        ufc = CRABClient.Emulator.getEmulator('ufc')({
            'endpoint': filecacheurl,
            "pycurl": True
        })
        result = ufc.upload(archiveName, excludeList=USER_SANDBOX_EXCLUSIONS)
        if 'hashkey' not in result:
            self.logger.error("Failed to upload source files: %s" %
                              str(result))
            raise CachefileNotFoundException
        return str(result['hashkey'])

    def checkdirectory(self, dir_):
        #checking for infinite symbolic link loop
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        except OSError as msg:
            err = '%sError%s: Infinite directory loop found in: %s \nStderr: %s' % \
                    (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)

    def __getattr__(self, *args):
        """
        Pass any unknown functions or attribute requests on to the TarFile object
        """
        self.logger.debug("Passing getattr %s on to TarFile" % args)
        return self.tarfile.__getattribute__(*args)

    def __enter__(self):
        """
        Allow use as context manager
        """
        return self

    def __exit__(self, excType, excValue, excTrace):
        """
        Allow use as context manager
        """
        self.tarfile.close()
        if excType:
            return False
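
checkdirectory() depends on os.stat() raising OSError (for example ELOOP, "Too many levels of symbolic links") when the walk runs into a symlink cycle; os.walk() itself swallows directory-listing errors, so the explicit stat of every file is what surfaces the problem. A self-contained POSIX sketch:

import os
import tempfile

def hasSymlinkLoop(path):
    # Mirrors checkdirectory(): stat every file reached while following links.
    try:
        for root, _, files in os.walk(path, followlinks=True):
            for name in files:
                os.stat(os.path.join(root, name))
    except OSError:
        return True
    return False

workdir = tempfile.mkdtemp()
loop = os.path.join(workdir, 'loop')
os.symlink(loop, loop)          # a link that points at itself
print(hasSymlinkLoop(workdir))  # True
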
Example #8
class UserTarball(object):
    """
        _UserTarball_

            A subclass of TarFile for the user code tarballs. By default
            creates a new tarball with the user libraries from lib, module,
            and the data/ and interface/ sections of the src/ area.

            Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball
        """
        directories = ['lib', 'biglib', 'module']
        if getattr(self.config.JobType, 'sendPythonFolder', configParametersInfo['JobType.sendPythonFolder']['default']):
            directories.append('python')
        # /data/ subdirs contain data files needed by the code
        # /interface/ subdirs contain C++ header files needed e.g. by ROOT6
        dataDirs    = ['data','interface']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in  directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug(" checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug(" adding directory %s to tarball" % fullPath)
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _, _ in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath,'src')
                self.logger.debug(" adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException('The input file "%s" taken from parameter config.JobType.inputFiles cannot be found' % globName)
            for filename in fileNames:
                self.logger.debug(" adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)


        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        # Adding the pset and crabconfig file to the tarfile
        if cfgOutputName:
            self.tarfile.add(cfgOutputName, arcname='PSet.py')
            self.tarfile.add(os.path.splitext(cfgOutputName)[0]+'.pkl', arcname='PSet.pkl')

        configtmp = tempfile.NamedTemporaryFile(delete=True)
        configtmp.write(str(self.config))
        configtmp.flush()
        psetfilename = getattr(self.config.JobType, 'psetName', None)
        if psetfilename is not None:
            self.tarfile.add(psetfilename,'/debug/originalPSet.py')
        else:
            self.logger.debug('Failed to add pset to tarball')
        self.tarfile.add(configtmp.name, '/debug/crabConfig.py')
        configtmp.close()

    def close(self):
        """
        Calculate the checksum and close
        """

        self.calculateChecksum()
        return self.tarfile.close()

    def upload(self, filecacheurl=None):
        """
        Upload the tarball to the File Cache
        """
        self.close()
        archiveName = self.tarfile.name
        self.logger.debug(" uploading archive to cache %s " % archiveName)
        ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint' : filecacheurl})
        result = ufc.upload(archiveName)
        if 'hashkey' not in result:
            self.logger.error("Failed to upload source files: %s" % str(result))
            raise CachefileNotFoundException
        return str(result['hashkey']) + '.tar.gz', self.checksum


    def calculateChecksum(self):
        """
        Calculate a checksum that doesn't depend on the tgz
        creation data
        """
        lsl = [(x.name, int(x.size), int(x.mtime), x.uname) for x in self.tarfile.getmembers()]
        hasher = hashlib.md5(str(lsl))
        self.logger.debug('tgz contents: %s' % lsl)
        self.checksum = hasher.hexdigest()
        self.logger.debug('MD5 checksum: %s' % self.checksum)

        # Old way reads in the file again. May use for non-tar files if needed.
        #sha256sum = hashlib.sha256()
        #with open(self.tarfile.name, 'rb') as f:
            #while True:
                #chunkdata = f.read(8192)
                #if not chunkdata:
                    #break
                #sha256sum.update(chunkdata)
        #sha256sum.hexdigest()

    def checkdirectory(self, dir_):
        #checking for infinite symbolic link loop
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        except OSError as msg:
            err = '%sError%s: Infinite directory loop found in: %s \nStderr: %s' % \
                    (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)

    def __getattr__(self, *args):
        """
        Pass any unknown functions or attribute requests on to the TarFile object
        """
        self.logger.debug("Passing getattr %s on to TarFile" % args)
        return self.tarfile.__getattribute__(*args)


    def __enter__(self):
        """
        Allow use as context manager
        """
        return self


    def __exit__(self, excType, excValue, excTrace):
        """
        Allow use as context manager
        """
        self.tarfile.close()
        if excType:
            return False
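
calculateChecksum() hashes the member metadata (name, size, mtime, owner) instead of the bytes of the archive, because a gzip stream embeds its creation timestamp: two tarballs with identical content built at different times differ byte-for-byte. A Python 3 rendering of the same idea (the original is Python 2, where str(lsl) can be hashed directly):

import hashlib
import tarfile

def tarballMetadataChecksum(path):
    # Hash the member listing, not the compressed bytes, so the gzip
    # header timestamp does not change the digest.
    with tarfile.open(path) as tar:
        lsl = [(m.name, int(m.size), int(m.mtime), m.uname)
               for m in tar.getmembers()]
    return hashlib.md5(str(lsl).encode('utf-8')).hexdigest()
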
Example #9
    def run(self, filecacheurl=None):
        """
        Override run() for JobType
        """
        configArguments = {
            'addoutputfiles': [],
            'adduserfiles': [],
            'tfileoutfiles': [],
            'edmoutfiles': [],
        }

        if getattr(self.config.Data, 'useParent', False) and getattr(
                self.config.Data, 'secondaryInputDataset', None):
            msg = "Invalid CRAB configuration: Parameters Data.useParent and Data.secondaryInputDataset cannot be used together."
            raise ConfigurationException(msg)

        # Get SCRAM environment
        scram = ScramEnvironment(logger=self.logger)

        configArguments.update({
            'jobarch': scram.getScramArch(),
            'jobsw': scram.getCmsswVersion()
        })

        # Build tarball
        if self.workdir:
            tarUUID = PandaInterface.wrappedUuidGen()
            self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
            if len(tarUUID):
                tarFilename = os.path.join(self.workdir,
                                           tarUUID + 'default.tgz')
                cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE)
            else:
                raise EnvironmentException(
                    'Problem with uuidgen while preparing for Sandbox upload.')
        else:
            _dummy, tarFilename = tempfile.mkstemp(suffix='.tgz')
            _dummy, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')

        if getattr(self.config.Data, 'inputDataset', None):
            configArguments['inputdata'] = self.config.Data.inputDataset

        ## Create CMSSW config.
        self.logger.debug("self.config: %s" % (self.config))
        self.logger.debug("self.config.JobType.psetName: %s" %
                          (self.config.JobType.psetName))
        ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent
        ## in the sense that a second loading of the same pset may not produce the same
        ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW
        ## pset twice. However, some "complicated" psets seem to evade the caching.
        ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that
        ## it can be reused later if wanted (for example, in PrivateMC when checking if
        ## the pset has an LHE source) instead of having to load the pset again.
        ## As for what does "complicated" psets mean, Daniel Riley said that there are
        ## some psets where one module modifies the configuration from another module.
        self.cmsswCfg = CMSSWConfig(config=self.config,
                                    logger=self.logger,
                                    userConfig=self.config.JobType.psetName)

        ## If there is a CMSSW pset, do a basic validation of it.
        if not bootstrapDone() and self.config.JobType.psetName:
            valid, msg = self.cmsswCfg.validateConfig()
            if not valid:
                raise ConfigurationException(msg)

        ## We need to put the pickled CMSSW configuration in the right place.
        ## Here, we determine if the bootstrap script has already run and prepared everything
        ## for us. In such case we move the file, otherwise we pickle.dump the pset
        if not bootstrapDone():
            # Write out CMSSW config
            self.cmsswCfg.writeFile(cfgOutputName)
        else:
            # Move the pickled and the configuration files created by the bootstrap script
            self.moveCfgFile(cfgOutputName)

        ## Interrogate the CMSSW pset for output files (only output files produced by
        ## PoolOutputModule or TFileService are identified automatically). Do this
        ## automatic detection even if JobType.disableAutomaticOutputCollection = True,
        ## so that we can still classify the output files in EDM, TFile and additional
        ## output files in the Task DB (and the job ad).
        ## TODO: Do we really need this classification at all? cmscp and PostJob read
        ## the FJR to know if an output file is EDM, TFile or other.
        edmfiles, tfiles = self.cmsswCfg.outputFiles()
        ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile
        ## output files that are not listed in JobType.outputFiles.
        if getattr(
                self.config.JobType, 'disableAutomaticOutputCollection',
                getParamDefaultValue(
                    'JobType.disableAutomaticOutputCollection')):
            outputFiles = [
                re.sub(r'^file:', '', file)
                for file in getattr(self.config.JobType, 'outputFiles', [])
            ]
            edmfiles = [file for file in edmfiles if file in outputFiles]
            tfiles = [file for file in tfiles if file in outputFiles]
        ## Get the list of additional output files that have to be collected as given
        ## in JobType.outputFiles, but remove duplicates listed already as EDM files or
        ## TFiles.
        addoutputFiles = [
            re.sub(r'^file:', '', file)
            for file in getattr(self.config.JobType, 'outputFiles', [])
            if re.sub(r'^file:', '', file) not in edmfiles + tfiles
        ]
        self.logger.debug(
            "The following EDM output files will be collected: %s" % edmfiles)
        self.logger.debug(
            "The following TFile output files will be collected: %s" % tfiles)
        self.logger.debug(
            "The following user output files will be collected: %s" %
            addoutputFiles)
        configArguments['edmoutfiles'] = edmfiles
        configArguments['tfileoutfiles'] = tfiles
        configArguments['addoutputfiles'].extend(addoutputFiles)
        ## Give warning message in case no output file was detected in the CMSSW pset
        ## nor was any specified in the CRAB configuration.
        if not configArguments['edmoutfiles'] and not configArguments[
                'tfileoutfiles'] and not configArguments['addoutputfiles']:
            msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
            if getattr(
                    self.config.JobType, 'disableAutomaticOutputCollection',
                    getParamDefaultValue(
                        'JobType.disableAutomaticOutputCollection')):
                msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration"
                msg += " and no output file was explicitly specified in the CRAB configuration."
            else:
                msg += " CRAB could not detect any output file in the CMSSW configuration"
                msg += " nor was any explicitly specified in the CRAB configuration."
            msg += " Hence CRAB will not collect any output file from this task."
            self.logger.warning(msg)

        ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
        ## Since ScramEnvironment is already called above and the exception is not
        ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
        ## But otherwise we should take this into account.
        with UserTarball(name=tarFilename,
                         logger=self.logger,
                         config=self.config) as tb:
            inputFiles = [
                re.sub(r'^file:', '', file)
                for file in getattr(self.config.JobType, 'inputFiles', [])
            ]
            tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
            configArguments['adduserfiles'] = [
                os.path.basename(f) for f in inputFiles
            ]
            try:
                uploadResult = tb.upload(filecacheurl=filecacheurl)
            except HTTPException as hte:
                if 'X-Error-Info' in hte.headers:
                    reason = hte.headers['X-Error-Info']
                    reason_re = re.compile(
                        r'\AFile size is ([0-9]*)B\. This is bigger than the maximum allowed size of ([0-9]*)B\.$'
                    )
                    re_match = reason_re.match(reason)
                    if re_match:
                        ISBSize = int(re_match.group(1))
                        ISBSizeLimit = int(re_match.group(2))
                        reason = "%sError%s:" % (colors.RED, colors.NORMAL)
                        reason += " Input sanbox size is ~%sMB. This is bigger than the maximum allowed size of %sMB." % (
                            ISBSize / 1024 / 1024, ISBSizeLimit / 1024 / 1024)
                        ISBContent = sorted(tb.content, reverse=True)
                        biggestFileSize = ISBContent[0][0]
                        ndigits = int(
                            math.ceil(math.log(biggestFileSize + 1, 10)))
                        reason += "\nInput sanbox content sorted by size[Bytes]:"
                        for (size, name) in ISBContent:
                            reason += ("\n%" + str(ndigits) + "s\t%s") % (size,
                                                                          name)
                        raise ClientException(reason)
                raise hte
            except Exception as e:
                msg = (
                    "Impossible to calculate the checksum of the sandbox tarball.\nError message: %s.\n"
                    "More details can be found in %s" %
                    (e, self.logger.logfile))
                LOGGERS['CRAB3'].exception(
                    msg)  #the traceback is only printed into the logfile
                raise ClientException(msg)

        configArguments['cacheurl'] = filecacheurl
        configArguments['cachefilename'] = "%s.tar.gz" % uploadResult
        self.logger.debug("Result uploading input files: %(cachefilename)s " %
                          configArguments)

        # Upload list of user-defined input files to process as the primary input
        userFilesList = getattr(self.config.Data, 'userInputFiles', None)
        if userFilesList:
            self.logger.debug(
                "Attaching list of user-specified primary input files.")
            userFilesList = map(string.strip, userFilesList)
            userFilesList = [file for file in userFilesList if file]
            if len(userFilesList) != len(set(userFilesList)):
                msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
                msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries."
                msg += " Duplicated entries will be removed."
                self.logger.warning(msg)
            configArguments['userfiles'] = set(userFilesList)
            configArguments['primarydataset'] = getattr(
                self.config.Data, 'outputPrimaryDataset', 'CRAB_UserFiles')

        lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
        lumi_list = None
        if lumi_mask_name:
            self.logger.debug("Attaching lumi mask %s to the request" %
                              (lumi_mask_name))
            try:
                lumi_list = getLumiList(lumi_mask_name, logger=self.logger)
            except ValueError as ex:
                msg = "%sError%s:" % (colors.RED, colors.NORMAL)
                msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name,
                                                              ex)
                raise ConfigurationException(msg)
        run_ranges = getattr(self.config.Data, 'runRange', None)
        if run_ranges:
            run_ranges_is_valid = re.match(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$',
                                           run_ranges)
            if run_ranges_is_valid:
                run_list = getRunList(run_ranges)
                if lumi_list:
                    lumi_list.selectRuns(run_list)
                    if not lumi_list:
                        msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null."
                        raise ConfigurationException(msg)
                else:
                    if len(run_list) > 50000:
                        msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(
                            len(run_list))
                        msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                        raise ConfigurationException(msg)
                    lumi_list = LumiList(runs=run_list)
            else:
                msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. Example: '12345,99900-99910'"
                raise ConfigurationException(msg)
        if lumi_list:
            configArguments['runs'] = lumi_list.getRuns()
            ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
            lumi_mask = lumi_list.getCompactList()
            configArguments['lumis'] = [
                str(reduce(lambda x, y: x + y,
                           lumi_mask[run]))[1:-1].replace(' ', '')
                for run in configArguments['runs']
            ]

        configArguments['jobtype'] = 'Analysis'

        return tarFilename, configArguments
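
The configArguments['lumis'] construction flattens each run's lumi ranges into a compact comma-separated string, as the inline comment shows ([[1,2],[5,5]] ==> '1,2,5,5'). The same transformation in isolation:

from functools import reduce

def encodeLumiRanges(ranges):
    # [[1, 2], [5, 5]] -> [1, 2, 5, 5] -> '[1, 2, 5, 5]' -> '1,2,5,5'
    flat = reduce(lambda x, y: x + y, ranges)
    return str(flat)[1:-1].replace(' ', '')

print(encodeLumiRanges([[1, 2], [5, 5]]))  # 1,2,5,5
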
Example #10
    def run(self, requestConfig):
        """
        Override run() for JobType
        """
        configArguments = {
            'addoutputfiles': [],
            'adduserfiles': [],
            'tfileoutfiles': [],
            'edmoutfiles': [],
        }

        # Get SCRAM environment
        scram = ScramEnvironment(logger=self.logger)

        configArguments.update({
            'jobarch': scram.scramArch,
            'jobsw': scram.cmsswVersion,
        })

        # Build tarball
        if self.workdir:
            tarUUID = PandaInterface.wrappedUuidGen()
            self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
            if len(tarUUID):
                tarFilename = os.path.join(self.workdir,
                                           tarUUID + 'default.tgz')
                cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py')
            else:
                raise EnvironmentException(
                    'Problem with uuidgen while preparing for Sandbox upload.')
        else:
            _dummy, tarFilename = tempfile.mkstemp(suffix='.tgz')
            _dummy, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')

        #configArguments['userisburl'] = 'https://'+ self.config.General.ufccacheUrl + '/crabcache/file?hashkey=' + uploadResults['hashkey']#XXX hardcoded
        #configArguments['userisburl'] = 'INSERTuserisburl'#XXX hardcoded
        if getattr(self.config.Data, 'inputDataset', None):
            configArguments['inputdata'] = self.config.Data.inputDataset
        # configArguments['ProcessingVersion'] = getattr(self.config.Data, 'processingVersion', None)

        # Create CMSSW config
        self.logger.debug("self.config: %s" % self.config)
        self.logger.debug("self.config.JobType.psetName: %s" %
                          self.config.JobType.psetName)
        cmsswCfg = CMSSWConfig(config=self.config,
                               logger=self.logger,
                               userConfig=self.config.JobType.psetName)

        ## Interrogate CMSSW config and user config for output file names. For now no use for EDM files or TFiles here.
        edmfiles, tfiles = cmsswCfg.outputFiles()
        addoutputFiles = [
            re.sub(r'^file:', '', file)
            for file in getattr(self.config.JobType, 'outputFiles', [])
            if re.sub(r'^file:', '', file) not in edmfiles + tfiles
        ]
        self.logger.debug(
            "The following EDM output files will be collected: %s" % edmfiles)
        self.logger.debug(
            "The following TFile output files will be collected: %s" % tfiles)
        self.logger.debug(
            "The following user output files will be collected: %s" %
            addoutputFiles)
        configArguments['edmoutfiles'] = edmfiles
        configArguments['tfileoutfiles'] = tfiles
        configArguments['addoutputfiles'].extend(addoutputFiles)

        # Write out CMSSW config
        cmsswCfg.writeFile(cfgOutputName)

        with UserTarball(name=tarFilename,
                         logger=self.logger,
                         config=self.config) as tb:
            inputFiles = [
                re.sub(r'^file:', '', file)
                for file in getattr(self.config.JobType, 'inputFiles', [])
            ]
            tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
            configArguments['adduserfiles'] = [
                os.path.basename(f) for f in inputFiles
            ]
            uploadResults = tb.upload()

        self.logger.debug("Result uploading input files: %s " %
                          str(uploadResults))
        configArguments['cachefilename'] = uploadResults[1]
        configArguments['cacheurl'] = uploadResults[0]
        isbchecksum = uploadResults[2]

        # Upload list of user-defined input files to process as the primary input
        userFileName = getattr(self.config.Data, 'userInputFile', None)
        if userFileName:
            self.logger.debug(
                "Attaching a list of user-specified primary input files from %s."
                % userFileName)
            fnames = []
            for fname in open(userFileName).readlines():
                fnames.append(fname.strip())
            configArguments['userfiles'] = filter(
                lambda x: x, fnames)  # remove blank lines and empty entries

            primDS = getattr(self.config.Data, 'primaryDataset', None)
            if primDS:
                # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar"
                primDS = "/" + os.path.join(*primDS.split("/"))
                if not re.match("/%(primDS)s.*" % lfnParts, primDS):
                    self.logger.warning(
                        "Invalid primary dataset name %s for private MC; publishing may fail"
                        % primDS)
                configArguments['inputdata'] = primDS
            else:
                configArguments['inputdata'] = getattr(self.config.Data,
                                                       'inputDataset',
                                                       '/CRAB_UserFiles')

        lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
        lumi_list = None
        if lumi_mask_name:
            self.logger.debug("Attaching lumi mask %s to the request" %
                              lumi_mask_name)
            lumi_list = getLumiList(lumi_mask_name, logger=self.logger)
        run_ranges = getattr(self.config.Data, 'runRange', None)
        run_ranges_is_valid = run_ranges is not None and isinstance(
            run_ranges, str) and re.match(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$',
                                          run_ranges)
        if run_ranges_is_valid:
            run_list = getRunList(run_ranges)
            if lumi_list:
                lumi_list.selectRuns(run_list)
            else:
                if len(run_list) > 50000:
                    msg = "Data.runRange includes %s runs." % str(
                        len(run_list))
                    msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                    raise ConfigurationException(msg)
                lumi_list = LumiList(runs=run_list)
        if lumi_list:
            configArguments['runs'] = lumi_list.getRuns()
            ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
            lumi_mask = lumi_list.getCompactList()
            configArguments['lumis'] = [
                str(reduce(lambda x, y: x + y,
                           lumi_mask[run]))[1:-1].replace(' ', '')
                for run in configArguments['runs']
            ]

        configArguments['jobtype'] = 'Analysis'

        return tarFilename, configArguments, isbchecksum
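
The Data.runRange validation pattern accepts comma-separated run numbers and single-dash inclusive ranges, while the negative lookahead (?!(-\d+-)) rejects chained ranges such as 1-2-3. A quick check of the regex used above:

import re

RUN_RANGE_RE = re.compile(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$')

# Valid: '12345' and '12345,99900-99910'; invalid: '1-2-3' and '12345,'.
for candidate in ('12345', '12345,99900-99910', '1-2-3', '12345,'):
    print(candidate, bool(RUN_RANGE_RE.match(candidate)))
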
Example #11
    def run(self, requestConfig):
        """
        Override run() for JobType
        """
        configArguments = {
            'addoutputfiles': [],
            'adduserfiles': [],
            'tfileoutfiles': [],
            'edmoutfiles': [],
        }

        # Get SCRAM environment
        scram = ScramEnvironment(logger=self.logger)

        configArguments.update({
            'jobarch': scram.scramArch,
            'jobsw': scram.cmsswVersion,
        })

        # Build tarball
        if self.workdir:
            tarFilename = os.path.join(
                self.workdir,
                PandaInterface.wrappedUuidGen() + 'default.tgz')
            cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py')
        else:
            _dummy, tarFilename = tempfile.mkstemp(suffix='.tgz')
            _dummy, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')

        #configArguments['userisburl'] = 'https://'+ self.config.General.ufccacheUrl + '/crabcache/file?hashkey=' + uploadResults['hashkey']#XXX hardcoded
        #configArguments['userisburl'] = 'INSERTuserisburl'#XXX hardcoded
        if getattr(self.config.Data, 'inputDataset', None):
            configArguments['inputdata'] = self.config.Data.inputDataset
        # configArguments['ProcessingVersion'] = getattr(self.config.Data, 'processingVersion', None)

        # Create CMSSW config
        self.logger.debug("self.config: %s" % self.config)
        self.logger.debug("self.config.JobType.psetName: %s" %
                          self.config.JobType.psetName)
        cmsswCfg = CMSSWConfig(config=self.config,
                               logger=self.logger,
                               userConfig=self.config.JobType.psetName)

        # Interrogate CMSSW config and user config for output file names; for now no use for edmFiles or TFiles here.
        analysisFiles, edmFiles = cmsswCfg.outputFiles()
        self.logger.debug("TFiles %s and EDM Files %s will be collected" %
                          (analysisFiles, edmFiles))
        configArguments['tfileoutfiles'] = analysisFiles
        configArguments['edmoutfiles'] = edmFiles

        outputFiles = getattr(self.config.JobType, 'outputFiles', [])
        self.logger.debug("User files %s will be collected" % outputFiles)
        configArguments['addoutputfiles'].extend(outputFiles)

        # Write out CMSSW config
        cmsswCfg.writeFile(cfgOutputName)

        with UserTarball(name=tarFilename,
                         logger=self.logger,
                         config=self.config) as tb:
            inputFiles = getattr(self.config.JobType, 'inputFiles', [])
            tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
            configArguments['adduserfiles'] = [
                os.path.basename(f) for f in inputFiles
            ]
            uploadResults = tb.upload()

        self.logger.debug("Result uploading input files: %s " %
                          str(uploadResults))
        configArguments['cachefilename'] = uploadResults[1]
        configArguments['cacheurl'] = uploadResults[0]
        isbchecksum = uploadResults[2]

        # Upload lumi mask if it exists
        lumiMaskName = getattr(self.config.Data, 'lumiMask', None)
        if lumiMaskName:
            self.logger.debug("Attaching lumi mask %s to the request" %
                              lumiMaskName)
            lumiDict = getLumiMask(config=self.config, logger=self.logger)
            configArguments['runs'] = lumiDict.keys()
            #for each run we'll encode the lumis as a string representing a list of integers
            #[[1,2],[5,5]] ==> '1,2,5,5'
            configArguments['lumis'] = [ str(reduce(lambda x,y: x+y, \
                                            lumiDict[run]))[1:-1].replace(' ','') \
                                            for run in configArguments['runs'] ]

        configArguments['jobtype'] = 'Analysis'

        return tarFilename, configArguments, isbchecksum
Example #12
class UserTarball(object):
    """
        _UserTarball_

            A subclass of TarFile for the user code tarballs. By default
            creates a new tarball with the user libraries from lib, module,
            and the data/ and interface/ sections of the src/ area.

            Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:bz2', config=None, logger=None, crabserver=None, s3tester=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None
        self.content = None
        self.crabserver = crabserver
        self.s3tester = s3tester

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball
        """
        directories = ['lib', 'biglib', 'module']
        if getattr(self.config.JobType, 'sendPythonFolder', configParametersInfo['JobType.sendPythonFolder']['default']):
            directories.append('python')
            directories.append('cfipython')
        if getattr(self.config.JobType, 'sendExternalFolder', configParametersInfo['JobType.sendExternalFolder']['default']):
            externalDirPath = os.path.join(self.scram.getCmsswBase(), 'external')
            if os.path.exists(externalDirPath) and os.listdir(externalDirPath) != []:
                directories.append('external')
            else:
                self.logger.info("The config.JobType.sendExternalFolder parameter is set to True but the external directory "\
                                  "doesn't exist or is empty, not adding to tarball. Path: %s" % externalDirPath)

        # Note that dataDirs are only looked-for and added under the src/ folder.
        # /data/ subdirs contain data files needed by the code
        # /interface/ subdirs contain C++ header files needed e.g. by ROOT6
        dataDirs = ['data', 'interface']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug("Checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug("Adding directory %s to tarball" % fullPath)
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _, _ in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath, 'src')
                self.logger.debug("Adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException("The input file '%s' taken from parameter config.JobType.inputFiles cannot be found." % globName)
            for filename in fileNames:
                self.logger.debug("Adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)


        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        # Adding the pset files to the tarfile
        if cfgOutputName:
            basedir = os.path.dirname(cfgOutputName)
            self.tarfile.add(cfgOutputName, arcname=BOOTSTRAP_CFGFILE)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_PKL), arcname=BOOTSTRAP_CFGFILE_PKL)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_DUMP), arcname=BOOTSTRAP_CFGFILE_DUMP)

    def addMonFiles(self):
        """
        Add monitoring files to the debug tarball.
        """
        configtmp = tempfile.NamedTemporaryFile(mode='w', delete=True)
        configtmp.write(str(self.config))
        configtmp.flush()
        psetfilename = getattr(self.config.JobType, 'psetName', None)
        if psetfilename:
            self.tarfile.add(psetfilename, '/debug/originalPSet.py')
        else:
            self.logger.debug('Failed to add pset to debug_files.tar.gz')

        self.tarfile.add(configtmp.name, '/debug/crabConfig.py')

        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        configtmp.close()

    def writeContent(self):
        """Save the content of the tarball"""
        self.content = [(int(x.size), x.name) for x in self.tarfile.getmembers()]


    def close(self):
        """
        Calculate the checkum and close
        """
        self.writeContent()
        return self.tarfile.close()

    def printSortedContent(self):
        """
        To be used for diagnostic printouts.
        Returns a string containing the tarball content as a list of files
        sorted by size, already formatted for use in a print statement.
        """
        sortedContent = sorted(self.content, reverse=True)
        biggestFileSize = sortedContent[0][0]
        ndigits = int(math.ceil(math.log(biggestFileSize+1, 10)))
        contentList = "\nsandbox content sorted by size[Bytes]:"
        for (size, name) in sortedContent:
            contentList += ("\n%" + str(ndigits) + "s\t%s") % (size, name)
        return contentList

    def upload(self, filecacheurl=None):
        """
        Upload the tarball to the File Cache
        """

        self.close()
        archiveName = self.tarfile.name
        archiveSizeBytes = os.path.getsize(archiveName)

        # in python3 and python2 with __future__ division, double / means integer division
        archiveSizeKB = archiveSizeBytes//1024
        if archiveSizeKB <= 512:
            archiveSize = "%d KB" % archiveSizeKB
        elif archiveSizeKB < 1024*10:
            # in python3 and python2 with __future__ division, single / means floating point division
            archiveSize = "%3f.1 MB" % (archiveSizeKB/1024)
        else:
            archiveSize = "%d MB" % (archiveSizeKB//1024)
        if archiveSizeBytes > FILE_SIZE_LIMIT:
            msg = ("%sError%s: input tarball size %s exceeds maximum allowed limit of %d MB" %
                   (colors.RED, colors.NORMAL, archiveSize, FILE_SIZE_LIMIT//1024//1024))
            msg += self.printSortedContent()
            raise SandboxTooBigException(msg)

        msg = ("Uploading archive %s (%s) to the CRAB cache. Using URI %s" %
               (archiveName, archiveSize, filecacheurl))
        self.logger.debug(msg)

        if 'S3' in filecacheurl.upper():
            # use S3
            # generate a 32char hash like UserFileCache used to do
            hashkey = calculateChecksum(archiveName, exclude=NEW_USER_SANDBOX_EXCLUSIONS)
            # the ".tar.gz" suffix here is forced by other places in the client which add it when
            # storing tarball name in task table. Not very elegant to need to hardcode in several places.
            cachename = "%s.tar.gz" % hashkey
            # current code requires a taskname to extract username. Any dummy one will do
            # next version of RESTCache will get username from cmsweb FE headers
            uploadToS3(crabserver=self.crabserver, objecttype='sandbox', filepath=archiveName,
                       tarballname=cachename, logger=self.logger)
        else:
            # old way using UFC
            ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint' : filecacheurl, "pycurl": True})
            t1 = time.time()
            result = ufc.upload(archiveName, excludeList=NEW_USER_SANDBOX_EXCLUSIONS)
            ufcSeconds = int(time.time()-t1)
            if 'hashkey' not in result:
                self.logger.error("Failed to upload archive: %s" % str(result))
                raise CachefileNotFoundException
            hashkey = str(result['hashkey'])
            # upload a copy to S3 dev as well, just to stress it a bit, this never raises
            s3report = testS3upload(self.s3tester, archiveName, hashkey, self.logger)
            # report also how long it took uploading to UFC (which surely worked if we are here)
            s3report['ufcseconds'] = ufcSeconds
            # upload S3 test report to crabcache
            reportFile = '/tmp/crabs3report.' + uuid.uuid4().hex
            with open(reportFile, 'w') as fp:
                json.dump(s3report, fp)
            reportName = 'S3-' + s3report['timestamp'] + ':s3report.json'
            try:
                ufc.uploadLog(reportFile, reportName)
                self.logger.debug('Report of S3 upload stored on CrabCache as %s', reportName)
            except Exception as e:
                self.logger.debug(str(e))
            os.remove(reportFile)
        return hashkey


    def checkdirectory(self, dir_):
        #checking for infinite symbolic link loop
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        except OSError as msg:
            err = '%sError%s: Infinite directory loop found in: %s \nStderr: %s' % \
                    (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)


    def __getattr__(self, *args):
        """
        Pass any unknown functions or attribute requests on to the TarFile object
        """
        self.logger.debug("Passing getattr %s on to TarFile" % args)
        return self.tarfile.__getattribute__(*args)


    def __enter__(self):
        """
        Allow use as context manager
        """
        return self


    def __exit__(self, excType, excValue, excTrace):
        """
        Allow use as context manager
        """
        self.tarfile.close()
        if excType:
            return False
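
The size report in upload() switches units at 512 KB and 10 MB, using integer division (//) for whole KB/MB and true division for the one-decimal MB range. The same thresholds as a standalone helper:

def humanReadableSize(nbytes):
    # Whole KB up to 512 KB, one-decimal MB up to 10 MB, then whole MB.
    kb = nbytes // 1024
    if kb <= 512:
        return "%d KB" % kb
    if kb < 1024 * 10:
        return "%.1f MB" % (kb / 1024.0)
    return "%d MB" % (kb // 1024)

print(humanReadableSize(3 * 1024 * 1024))  # 3.0 MB
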
Example #13
class UserTarball(object):
    """
        _UserTarball_

            A subclass of TarFile for the user code tarballs. By default
            creates a new tarball with the user libraries from lib, module,
            and the data/ and interface/ sections of the src/ area.

            Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball
        """
        directories = ['lib', 'biglib', 'module']
        if getattr(self.config.JobType, 'sendPythonFolder', configParametersInfo['JobType.sendPythonFolder']['default']):
            directories.append('python')
        # /data/ subdirs contain data files needed by the code
        # /interface/ subdirs contain C++ header files needed e.g. by ROOT6
        dataDirs    = ['data','interface']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug("Checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug("Adding directory %s to tarball" % fullPath)
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _, _ in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath,'src')
                self.logger.debug("Adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException("The input file '%s' taken from parameter config.JobType.inputFiles cannot be found." % globName)
            for filename in fileNames:
                self.logger.debug("Adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)


        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        # Adding the pset files to the tarfile
        if cfgOutputName:
            basedir = os.path.dirname(cfgOutputName)
            self.tarfile.add(cfgOutputName, arcname=BOOTSTRAP_CFGFILE)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_PKL), arcname=BOOTSTRAP_CFGFILE_PKL)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_DUMP), arcname=BOOTSTRAP_CFGFILE_DUMP)

        #debug directory
        configtmp = tempfile.NamedTemporaryFile(delete=True)
        configtmp.write(str(self.config))
        configtmp.flush()
        psetfilename = getattr(self.config.JobType, 'psetName', None)
        if psetfilename is not None:
            self.tarfile.add(psetfilename,'/debug/originalPSet.py')
        else:
            self.logger.debug('Failed to add pset to tarball')
        self.tarfile.add(configtmp.name, '/debug/crabConfig.py')
        configtmp.close()


    def writeContent(self):
        """Save the content of the tarball"""
        self.content = [(int(x.size), x.name) for x in self.tarfile.getmembers()]


    def close(self):
        """
        Calculate the checksum and close
        """
        self.writeContent()
        return self.tarfile.close()


    def upload(self, filecacheurl=None):
        """
        Upload the tarball to the File Cache
        """
        self.close()
        archiveName = self.tarfile.name
        self.logger.debug("Uploading archive %s to the CRAB cache. Using URI %s" % (archiveName, filecacheurl))
        ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint' : filecacheurl, "pycurl": True})
        result = ufc.upload(archiveName, excludeList = USER_SANDBOX_EXCLUSIONS)
        if 'hashkey' not in result:
            self.logger.error("Failed to upload source files: %s" % str(result))
            raise CachefileNotFoundException
        return str(result['hashkey'])


    def checkdirectory(self, dir_):
        #checking for infinite symbolic link loop
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        except OSError as msg:
            err = '%sError%s: Infinite directory loop found in: %s \nStderr: %s' % \
                    (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)


    def __getattr__(self, *args):
        """
        Pass any unknown functions or attribute requests on to the TarFile object
        """
        self.logger.debug("Passing getattr %s on to TarFile" % args)
        return self.tarfile.__getattribute__(*args)


    def __enter__(self):
        """
        Allow use as context manager
        """
        return self


    def __exit__(self, excType, excValue, excTrace):
        """
        Allow use as context manager
        """
        self.tarfile.close()
        if excType:
            return False
Example #14
class UserTarball(object):
    """
        _UserTarball_

            A subclass of TarFile for the user code tarballs. By default
            creates a new tarball with the user libraries from lib, module,
            and the data/ and interface/ sections of the src/ area.

            Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:bz2', config=None, logger=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None
        self.content = None

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball
        """
        directories = ['lib', 'biglib', 'module']
        if getattr(self.config.JobType, 'sendPythonFolder', configParametersInfo['JobType.sendPythonFolder']['default']):
            directories.append('python')
            directories.append('cfipython')
        if getattr(self.config.JobType, 'sendExternalFolder', configParametersInfo['JobType.sendExternalFolder']['default']):
            externalDirPath = os.path.join(self.scram.getCmsswBase(), 'external')
            if os.path.exists(externalDirPath) and os.listdir(externalDirPath) != []:
                directories.append('external')
            else:
                self.logger.info("The config.JobType.sendExternalFolder parameter is set to True but the external directory "\
                                  "doesn't exist or is empty, not adding to tarball. Path: %s" % externalDirPath)

        # Note that dataDirs are only looked-for and added under the src/ folder.
        # /data/ subdirs contain data files needed by the code
        # /interface/ subdirs contain C++ header files needed e.g. by ROOT6
        dataDirs    = ['data','interface']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug("Checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug("Adding directory %s to tarball" % fullPath)
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _, _ in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath,'src')
                self.logger.debug("Adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException("The input file '%s' taken from parameter config.JobType.inputFiles cannot be found." % globName)
            for filename in fileNames:
                self.logger.debug("Adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)


        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        # Adding the pset files to the tarfile
        if cfgOutputName:
            basedir = os.path.dirname(cfgOutputName)
            self.tarfile.add(cfgOutputName, arcname=BOOTSTRAP_CFGFILE)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_PKL), arcname=BOOTSTRAP_CFGFILE_PKL)
            self.tarfile.add(os.path.join(basedir, BOOTSTRAP_CFGFILE_DUMP), arcname=BOOTSTRAP_CFGFILE_DUMP)

    def addMonFiles(self):
        """
        Add monitoring files to the debug tarball.
        """
        configtmp = tempfile.NamedTemporaryFile(delete=True)
        configtmp.write(str(self.config))
        configtmp.flush()
        psetfilename = getattr(self.config.JobType, 'psetName', None)
        if psetfilename is not None:
            self.tarfile.add(psetfilename,'/debug/originalPSet.py')
        else:
            self.logger.debug('Failed to add pset to debug_files.tar.gz')

        self.tarfile.add(configtmp.name, '/debug/crabConfig.py')

        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        configtmp.close()

    def writeContent(self):
        """Save the content of the tarball"""
        self.content = [(int(x.size), x.name) for x in self.tarfile.getmembers()]


    def close(self):
        """
        Calculate the checksum and close
        """
        self.writeContent()
        return self.tarfile.close()

    def printSortedContent(self):
        """
        To be used for diagnostic printouts.
        Returns a string containing the tarball content as a list of files
        sorted by size, already formatted for use in a print statement.
        """
        sortedContent = sorted(self.content, reverse=True)
        biggestFileSize = sortedContent[0][0]
        ndigits = int(math.ceil(math.log(biggestFileSize+1, 10)))
        contentList  = "\nsandbox content sorted by size[Bytes]:"
        for (size, name) in sortedContent:
            contentList += ("\n%" + str(ndigits) + "s\t%s") % (size, name)
        return contentList


    def upload(self, filecacheurl=None):
        """
        Upload the tarball to the File Cache
        """

        self.close()
        archiveName = self.tarfile.name
        archiveSizeBytes = os.path.getsize(archiveName)

        # in python3, and in python2 with __future__ division, // means integer division
        archiveSizeKB = archiveSizeBytes//1024
        if archiveSizeKB <= 512:
            archiveSize = "%d KB" % archiveSizeKB
        elif archiveSizeKB < 1024*10:
            # in python3, and in python2 with __future__ division, / means floating point division
            archiveSize = "%.1f MB" % (archiveSizeKB/1024)
        else:
            archiveSize = "%d MB" % (archiveSizeKB//1024)
        if archiveSizeBytes > FILE_SIZE_LIMIT:
            msg = ("%sError%s: input tarball size %s exceeds maximum allowed limit of %d MB" % (colors.RED, colors.NORMAL, archiveSize, FILE_SIZE_LIMIT//1024//1024))
            msg += self.printSortedContent()
            raise SandboxTooBigException(msg)

        msg=("Uploading archive %s (%s) to the CRAB cache. Using URI %s" % (archiveName, archiveSize, filecacheurl))
        self.logger.debug(msg)

        ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint' : filecacheurl, "pycurl": True})
        result = ufc.upload(archiveName, excludeList = NEW_USER_SANDBOX_EXCLUSIONS)
        if 'hashkey' not in result:
            self.logger.error("Failed to upload archive: %s" % str(result))
            raise CachefileNotFoundException
        return str(result['hashkey'])


    def checkdirectory(self, dir_):
        #checking for infinite symbolic link loop
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        except OSError as msg:
            err = '%sError%s: Infinite directory loop found in: %s \nStderr: %s' % \
                    (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)


    def __getattr__(self, *args):
        """
        Pass any unknown functions or attribute requests on to the TarFile object
        """
        self.logger.debug("Passing getattr %s on to TarFile" % args)
        return self.tarfile.__getattribute__(*args)


    def __enter__(self):
        """
        Allow use as context manager
        """
        return self


    def __exit__(self, excType, excValue, excTrace):
        """
        Allow use as context manager
        """
        self.tarfile.close()
        if excType:
            return False
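
A quick worked example of the size pretty-printing in upload() above, using an
assumed 3.5 MB archive (// is integer division, / is floating point division):

archiveSizeBytes = 3 * 1024 * 1024 + 512 * 1024    # 3.5 MB, an assumed value
archiveSizeKB = archiveSizeBytes // 1024           # 3584 KB -> takes the MB branch
print("%.1f MB" % (archiveSizeKB / 1024.0))        # prints: 3.5 MB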
Example no. 17
0
    def checkAutomaticAvail(self, allowedSplitAlgos):
        scram = ScramEnvironment(logger=self.logger)
        major, minor = [int(v) for v in scram.getCmsswVersion().split('_', 3)[1:-1]]
        if major > 7 or (major == 7 and minor >= 2):
            allowedSplitAlgos.append('Automatic')
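
A worked example of the version parsing above; 'CMSSW_7_2_3' is an assumed
version string:

version = 'CMSSW_7_2_3'
# split('_', 3) -> ['CMSSW', '7', '2', '3']; the [1:-1] slice keeps ['7', '2']
major, minor = [int(v) for v in version.split('_', 3)[1:-1]]
assert (major, minor) == (7, 2)   # 7.2 passes the check, so 'Automatic' is allowed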
Example no. 18
0
    def run(self, filecacheurl = None):
        """
        Override run() for JobType
        """
        configArguments = {'addoutputfiles'            : [],
                           'adduserfiles'              : [],
                           'tfileoutfiles'             : [],
                           'edmoutfiles'               : [],
                          }

        # Get SCRAM environment
        scram = ScramEnvironment(logger=self.logger)

        configArguments.update({'jobarch'    : scram.scramArch,
                                'jobsw' : scram.cmsswVersion, })

        # Build tarball
        if self.workdir:
            tarUUID =  PandaInterface.wrappedUuidGen()
            self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
            if len(tarUUID):
                tarFilename   = os.path.join(self.workdir, tarUUID +'default.tgz')
                cfgOutputName = os.path.join(self.workdir, 'CMSSW_cfg.py')
            else:
                raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
        else:
            _dummy, tarFilename   = tempfile.mkstemp(suffix='.tgz')
            _dummy, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')

        if getattr(self.config.Data, 'inputDataset', None):
            configArguments['inputdata'] = self.config.Data.inputDataset
#        configArguments['ProcessingVersion'] = getattr(self.config.Data, 'processingVersion', None)

        # Create CMSSW config
        self.logger.debug("self.config: %s" % self.config)
        self.logger.debug("self.config.JobType.psetName: %s" % self.config.JobType.psetName)
        cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                               userConfig=self.config.JobType.psetName)

        ## Interrogate the CMSSW config and the user config for output file names. For now no further use is made of the EDM files or TFiles here.
        edmfiles, tfiles = cmsswCfg.outputFiles()
        addoutputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', []) if re.sub(r'^file:', '', file) not in edmfiles+tfiles]
        self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
        self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
        self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
        configArguments['edmoutfiles'] = edmfiles
        configArguments['tfileoutfiles'] = tfiles
        configArguments['addoutputfiles'].extend(addoutputFiles)

        # Write out CMSSW config
        cmsswCfg.writeFile(cfgOutputName)

        ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
        ## Since ScramEnvironment is already called above and the exception is not
        ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
        ## But otherwise we should take this into account.
        with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
            inputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'inputFiles', [])]
            tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
            configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
            uploadResults = tb.upload(filecacheurl = filecacheurl)

        self.logger.debug("Result uploading input files: %s " % str(uploadResults))
        configArguments['cacheurl'] = filecacheurl
        configArguments['cachefilename'] = uploadResults[0]
        isbchecksum = uploadResults[1]

        # Upload list of user-defined input files to process as the primary input
        userFilesList = getattr(self.config.Data, 'userInputFiles', None)
        if userFilesList:
            self.logger.debug("Attaching list of user-specified primary input files.")
            userFilesList = map(string.strip, userFilesList)
            userFilesList = [file for file in userFilesList if file]
            if len(userFilesList) != len(set(userFilesList)):
                msg  = "%sWarning%s: CRAB configuration parameter Data.userInputFiles contains duplicated entries." % (colors.RED, colors.NORMAL)
                msg += " Duplicated entries will be removed."    
                self.logger.warning(msg)
            configArguments['userfiles'] = set(userFilesList)
            ## Get the user-specified primary dataset name.
            primaryDataset = getattr(self.config.Data, 'primaryDataset', 'CRAB_UserFiles')
            # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar"
            primaryDataset = "/" + os.path.join(*primaryDataset.split("/"))
            if not re.match("/%(primDS)s.*" % (lfnParts), primaryDataset):
                self.logger.warning("Invalid primary dataset name %s; publication may fail." % (primaryDataset))
            configArguments['inputdata'] = primaryDataset

        lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
        lumi_list = None
        if lumi_mask_name:
            self.logger.debug("Attaching lumi mask %s to the request" % lumi_mask_name)
            lumi_list = getLumiList(lumi_mask_name, logger = self.logger)
        run_ranges = getattr(self.config.Data, 'runRange', None)
        run_ranges_is_valid = run_ranges is not None and isinstance(run_ranges, str) and re.match(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges)
        if run_ranges_is_valid:
            run_list = getRunList(run_ranges)
            if lumi_list:
                lumi_list.selectRuns(run_list)
            else:
                if len(run_list) > 50000:
                    msg  = "Data.runRange includes %s runs." % str(len(run_list))
                    msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                    raise ConfigurationException(msg)
                lumi_list = LumiList(runs = run_list)
        if lumi_list:
            configArguments['runs'] = lumi_list.getRuns()
            ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
            lumi_mask = lumi_list.getCompactList()
            configArguments['lumis'] = [str(reduce(lambda x,y: x+y, lumi_mask[run]))[1:-1].replace(' ','') for run in configArguments['runs']]

        configArguments['jobtype'] = 'Analysis'

        return tarFilename, configArguments, isbchecksum
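
A worked example of the lumi encoding described in the comment above; the
ranges are an assumed compact-list entry for a single run:

from functools import reduce    # builtin in python2, in functools in python3
lumi_ranges = [[1, 2], [5, 5]]
encoded = str(reduce(lambda x, y: x + y, lumi_ranges))[1:-1].replace(' ', '')
assert encoded == '1,2,5,5'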
Example no. 19
0
class UserTarball(object):
    """
        _UserTarball_

            A subclass of TarFile for the user code tarballs. By default
            creates a new tarball with the user libraries from lib, module,
            and the data/ sections of the src/ area.

            Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        self.tarfile = tarfile.open(name=name , mode=mode, dereference=True)
        self.checksum = None

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball
        """
        directories = ['lib', 'module']
        dataDirs    = ['data']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in  directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug(" checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug(" adding directory %s to tarball" % fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _dummy, _dummy in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath,'src')
                self.logger.debug(" adding data directory %s to tarball" % root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException('The input file "%s" taken from parameter config.JobType.inputFiles cannot be found' % globName)
            for filename in fileNames:
                self.logger.debug(" adding file %s to tarball" % filename)
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)

        # Adding the pset file to the tarfile
        if cfgOutputName:
            self.tarfile.add(cfgOutputName, arcname='PSet.py')
        currentPath = os.getcwd()

#        psetfile = getattr(self.config.JobType, 'psetName', None)
#        self.tarfile.add(os.path.join(currentPath, psetfile), arcname='PSet.py')

    def close(self):
        """
        Calculate the checksum and close
        """

        self.calculateChecksum()
        return self.tarfile.close()

    def upload(self):
        """
        Upload the tarball to the Panda Cache
        """
        self.close()
        archiveName = self.tarfile.name
        serverUrl = ""
        self.logger.debug(" uploading archive to cache %s " % archiveName)
        status,out = PandaInterface.putFile(archiveName, verbose=False, useCacheSrv=True, reuseSandbox=True)

        if out.startswith('NewFileName:'):
            # found the same input sandbox to reuse
            self.logger.debug("out: %s" % out)
            self.logger.debug("status: %s" % status)
            self.logger.debug("found the same input sandbox to reuse")
            archiveName = out.split(':')[-1]
            serverUrl = "https://%s:%s" % (out.split(':')[-2], '25443')
            self.logger.debug("archiveName: %s" %archiveName)
        elif out.startswith('True'):
            archiveName = out.split(':')[-1]
            serverUrl = "%s:%s:%s" % (out.split(':')[-4], out.split(':')[-3], out.split(':')[-2])
        else:
            self.logger.error( str(out) )
            self.logger.error("failed to upload source files with %s" % status)
            raise CachefileNotFoundException

        return serverUrl, archiveName, self.checksum

    def calculateChecksum(self):
        """
        Calculate a checksum that doesn't depend on the tgz
        creation data
        """

        lsl = [(x.name, int(x.size), int(x.mtime), x.uname) for x in self.tarfile.getmembers()]
        hasher = hashlib.md5(str(lsl))
        self.logger.debug('tgz contents: %s' % lsl)
        self.checksum = hasher.hexdigest()
        self.logger.debug('MD5 checksum: %s' % self.checksum)

        #Old way reads in the file again. May use for non-tar files if needed.
        #sha256sum = hashlib.sha256()
        #with open(self.tarfile.name, 'rb') as f:
            #while True:
                #chunkdata = f.read(8192)
                #if not chunkdata:
                    #break
                #sha256sum.update(chunkdata)
        #sha256sum.hexdigest()


    def __getattr__(self, *args):
        """
        Pass any unknown functions or attribute requests on to the TarFile object
        """
        self.logger.debug("Passing getattr %s on to TarFile" % args)
        return self.tarfile.__getattribute__(*args)


    def __enter__(self):
        """
        Allow use as context manager
        """
        return self


    def __exit__(self, excType, excValue, excTrace):
        """
        Allow use as context manager
        """
        self.tarfile.close()
        if excType:
            return False
Example no. 20
0
    def run(self, filecacheurl=None):
        """
        Override run() for JobType
        """
        configArguments = {"addoutputfiles": [], "adduserfiles": [], "tfileoutfiles": [], "edmoutfiles": []}

        # Get SCRAM environment
        scram = ScramEnvironment(logger=self.logger)

        configArguments.update({"jobarch": scram.getScramArch(), "jobsw": scram.getCmsswVersion()})

        # Build tarball
        if self.workdir:
            tarUUID = PandaInterface.wrappedUuidGen()
            self.logger.debug("UNIQUE NAME: tarUUID %s " % tarUUID)
            if len(tarUUID):
                tarFilename = os.path.join(self.workdir, tarUUID + "default.tgz")
                cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE)
            else:
                raise EnvironmentException("Problem with uuidgen while preparing for Sandbox upload.")
        else:
            _dummy, tarFilename = tempfile.mkstemp(suffix=".tgz")
            _dummy, cfgOutputName = tempfile.mkstemp(suffix="_cfg.py")

        if getattr(self.config.Data, "inputDataset", None):
            configArguments["inputdata"] = self.config.Data.inputDataset

        ## Create CMSSW config.
        self.logger.debug("self.config: %s" % (self.config))
        self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName))
        ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent
        ## in the sense that a second loading of the same pset may not produce the same
        ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW
        ## pset twice. However, some "complicated" psets seem to evade the caching.
        ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that
        ## it can be reused later if wanted (for example, in PrivateMC when checking if
        ## the pset has an LHE source) instead of having to load the pset again.
        ## As for what does "complicated" psets mean, Daniel Riley said that there are
        ## some psets where one module modifies the configuration from another module.
        self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger, userConfig=self.config.JobType.psetName)

        ## If there is a CMSSW pset, do a basic validation of it.
        if not bootstrapDone() and self.config.JobType.psetName:
            valid, msg = self.cmsswCfg.validateConfig()
            if not valid:
                raise ConfigurationException(msg)

        ## We need to put the pickled CMSSW configuration in the right place.
        ## Here, we determine if the bootstrap script already run and prepared everything
        ## for us. In such case we move the file, otherwise we pickle.dump the pset
        if not bootstrapDone():
            # Write out CMSSW config
            self.cmsswCfg.writeFile(cfgOutputName)
        else:
            # Move the pickled configuration file created by the bootstrap script
            self.moveCfgFile(cfgOutputName)

        ## Interrogate the CMSSW pset for output files (only output files produced by
        ## PoolOutputModule or TFileService are identified automatically). Do this
        ## automatic detection even if JobType.disableAutomaticOutputCollection = True,
        ## so that we can still classify the output files in EDM, TFile and additional
        ## output files in the Task DB (and the job ad).
        ## TODO: Do we really need this classification at all? cmscp and PostJob read
        ## the FJR to know if an output file is EDM, TFile or other.
        edmfiles, tfiles = self.cmsswCfg.outputFiles()
        ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile
        ## output files that are not listed in JobType.outputFiles.
        if getattr(
            self.config.JobType,
            "disableAutomaticOutputCollection",
            getParamDefaultValue("JobType.disableAutomaticOutputCollection"),
        ):
            outputFiles = [re.sub(r"^file:", "", file) for file in getattr(self.config.JobType, "outputFiles", [])]
            edmfiles = [file for file in edmfiles if file in outputFiles]
            tfiles = [file for file in tfiles if file in outputFiles]
        ## Get the list of additional output files that have to be collected as given
        ## in JobType.outputFiles, but remove duplicates listed already as EDM files or
        ## TFiles.
        addoutputFiles = [
            re.sub(r"^file:", "", file)
            for file in getattr(self.config.JobType, "outputFiles", [])
            if re.sub(r"^file:", "", file) not in edmfiles + tfiles
        ]
        self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
        self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
        self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
        configArguments["edmoutfiles"] = edmfiles
        configArguments["tfileoutfiles"] = tfiles
        configArguments["addoutputfiles"].extend(addoutputFiles)
        ## Give warning message in case no output file was detected in the CMSSW pset
        ## nor was any specified in the CRAB configuration.
        if (
            not configArguments["edmoutfiles"]
            and not configArguments["tfileoutfiles"]
            and not configArguments["addoutputfiles"]
        ):
            msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
            if getattr(
                self.config.JobType,
                "disableAutomaticOutputCollection",
                getParamDefaultValue("JobType.disableAutomaticOutputCollection"),
            ):
                msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration"
                msg += " and no output file was explicitly specified in the CRAB configuration."
            else:
                msg += " CRAB could not detect any output file in the CMSSW configuration"
                msg += " nor was any explicitly specified in the CRAB configuration."
            msg += " Hence CRAB will not collect any output file from this task."
            self.logger.warning(msg)

        ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
        ## Since ScramEnvironment is already called above and the exception is not
        ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
        ## But otherwise we should take this into account.
        with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
            inputFiles = [re.sub(r"^file:", "", file) for file in getattr(self.config.JobType, "inputFiles", [])]
            tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
            configArguments["adduserfiles"] = [os.path.basename(f) for f in inputFiles]
            uploadResults = tb.upload(filecacheurl=filecacheurl)

        self.logger.debug("Result uploading input files: %s " % str(uploadResults))
        configArguments["cacheurl"] = filecacheurl
        configArguments["cachefilename"] = uploadResults[0]
        isbchecksum = uploadResults[1]

        # Upload list of user-defined input files to process as the primary input
        userFilesList = getattr(self.config.Data, "userInputFiles", None)
        if userFilesList:
            self.logger.debug("Attaching list of user-specified primary input files.")
            userFilesList = map(string.strip, userFilesList)
            userFilesList = [file for file in userFilesList if file]
            if len(userFilesList) != len(set(userFilesList)):
                msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
                msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries."
                msg += " Duplicated entries will be removed."
                self.logger.warning(msg)
            configArguments["userfiles"] = set(userFilesList)
            ## Get the user-specified primary dataset name.
            primaryDataset = getattr(self.config.Data, "primaryDataset", "CRAB_UserFiles")
            # Normalizes "foo/bar" and "/foo/bar" to "/foo/bar"
            primaryDataset = "/" + os.path.join(*primaryDataset.split("/"))
            if not re.match("/%(primDS)s.*" % (lfnParts), primaryDataset):
                self.logger.warning("Invalid primary dataset name %s; publication may fail." % (primaryDataset))
            configArguments["inputdata"] = primaryDataset

        lumi_mask_name = getattr(self.config.Data, "lumiMask", None)
        lumi_list = None
        if lumi_mask_name:
            self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name))
            try:
                lumi_list = getLumiList(lumi_mask_name, logger=self.logger)
            except ValueError as ex:
                msg = "%sError%s:" % (colors.RED, colors.NORMAL)
                msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex)
                raise ConfigurationException(msg)
        run_ranges = getattr(self.config.Data, "runRange", None)
        if run_ranges:
            run_ranges_is_valid = re.match(r"^\d+((?!(-\d+-))(\,|\-)\d+)*$", run_ranges)
            if run_ranges_is_valid:
                run_list = getRunList(run_ranges)
                if lumi_list:
                    lumi_list.selectRuns(run_list)
                    if not lumi_list:
                        msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null."
                        raise ConfigurationException(msg)
                else:
                    if len(run_list) > 50000:
                        msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list))
                        msg += (
                            " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                        )
                        raise ConfigurationException(msg)
                    lumi_list = LumiList(runs=run_list)
            else:
                msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. Example: '12345,99900-99910'"
                raise ConfigurationException(msg)
        if lumi_list:
            configArguments["runs"] = lumi_list.getRuns()
            ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
            lumi_mask = lumi_list.getCompactList()
            configArguments["lumis"] = [
                str(reduce(lambda x, y: x + y, lumi_mask[run]))[1:-1].replace(" ", "")
                for run in configArguments["runs"]
            ]

        configArguments["jobtype"] = "Analysis"

        return tarFilename, configArguments, isbchecksum
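
A quick sketch of the 'file:' prefix stripping applied to the input and output
file lists above; the filenames are assumptions:

import re
files = ['file:histos.root', 'ntuple.root']
assert [re.sub(r'^file:', '', f) for f in files] == ['histos.root', 'ntuple.root']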
Example no. 21
0
class UserTarball(object):
    """
        _UserTarball_

            A subclass of TarFile for the user code tarballs. By default
            creates a new tarball with the user libraries from lib, module,
            and the data/ sections of the src/ area.

            Also adds user specified files in the right place.
    """
    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        self.tarfile = tarfile.open(name=name, mode=mode, dereference=True)
        self.checksum = None

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball
        """
        directories = ['lib', 'module']
        if getattr(
                self.config.JobType, 'sendPythonFolder',
                configParametersInfo['JobType.sendPythonFolder']['default']):
            directories.append('python')
        dataDirs = ['data']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug(" checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug(" adding directory %s to tarball" % fullPath)
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _dummy, _dummy in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath, 'src')
                self.logger.debug(" adding data directory %s to tarball" %
                                  root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException(
                    'The input file "%s" taken from parameter config.JobType.inputFiles cannot be found'
                    % globName)
            for filename in fileNames:
                self.logger.debug(" adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                self.tarfile.add(filename,
                                 os.path.basename(filename),
                                 recursive=True)

        scriptExe = getattr(self.config.JobType, 'scriptExe', None)
        if scriptExe:
            self.tarfile.add(scriptExe, arcname=os.path.basename(scriptExe))

        # Adding the pset and crabconfig file to the tarfile
        if cfgOutputName:
            self.tarfile.add(cfgOutputName, arcname='PSet.py')
            self.tarfile.add(os.path.splitext(cfgOutputName)[0] + '.pkl',
                             arcname='PSet.pkl')

        configtmp = tempfile.NamedTemporaryFile(delete=True)
        configtmp.write(str(self.config))
        configtmp.flush()
        psetfilename = getattr(self.config.JobType, 'psetName', None)
        if psetfilename is not None:
            self.tarfile.add(psetfilename, '/debug/originalPSet.py')
        else:
            self.logger.debug('Failed to add pset to tarball')
        self.tarfile.add(configtmp.name, '/debug/crabConfig.py')
        configtmp.close()

    def close(self):
        """
        Calculate the checksum and close
        """

        self.calculateChecksum()
        return self.tarfile.close()

    def upload(self, filecacheurl=None):
        """
        Upload the tarball to the File Cache
        """
        self.close()
        archiveName = self.tarfile.name
        self.logger.debug(" uploading archive to cache %s " % archiveName)
        ufc = CRABClient.Emulator.getEmulator('ufc')({
            'endpoint': filecacheurl
        })
        result = ufc.upload(archiveName)
        if 'hashkey' not in result:
            self.logger.error("Failed to upload source files: %s" %
                              str(result))
            raise CachefileNotFoundException
        return str(result['hashkey']) + '.tar.gz', self.checksum

    def calculateChecksum(self):
        """
        Calculate a checksum that doesn't depend on the tgz
        creation data
        """
        lsl = [(x.name, int(x.size), int(x.mtime), x.uname)
               for x in self.tarfile.getmembers()]
        hasher = hashlib.md5(str(lsl))
        self.logger.debug('tgz contents: %s' % lsl)
        self.checksum = hasher.hexdigest()
        self.logger.debug('MD5 checksum: %s' % self.checksum)

        #Old way reads in the file again. May use for non-tar files if needed.
        #sha256sum = hashlib.sha256()
        #with open(self.tarfile.name, 'rb') as f:
            #while True:
                #chunkdata = f.read(8192)
                #if not chunkdata:
                    #break
                #sha256sum.update(chunkdata)
        #sha256sum.hexdigest()

    def checkdirectory(self, dir_):
        #checking for infinite symbolic link loop
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))

        except OSError as msg:
            err = '%sError%s: Infinite directory loop found in: %s \nStderr: %s' % \
                    (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)
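
A sketch of the failure mode checkdirectory() guards against: a symlink that
points at itself makes os.stat() (which follows links) fail with ELOOP, which
the method turns into an EnvironmentException. The temp paths are illustrative
assumptions; POSIX only:

import os, tempfile
d = tempfile.mkdtemp()
os.symlink(os.path.join(d, 'x'), os.path.join(d, 'x'))   # x -> x, a self-loop
try:
    for root, _, files in os.walk(d, followlinks=True):
        for f in files:
            os.stat(os.path.join(root, f))   # raises OSError (ELOOP) on 'x'
except OSError as msg:
    print('infinite symlink loop: %s' % msg)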
Example no. 22
0
    def run(self, filecacheurl = None):
        """
        Override run() for JobType
        """
        configArguments = {'addoutputfiles'            : [],
                           'adduserfiles'              : [],
                           'tfileoutfiles'             : [],
                           'edmoutfiles'               : [],
                          }

        if getattr(self.config.Data, 'useParent', False) and getattr(self.config.Data, 'secondaryInputDataset', None):
            msg = "Invalid CRAB configuration: Parameters Data.useParent and Data.secondaryInputDataset cannot be used together."
            raise ConfigurationException(msg)

        # Get SCRAM environment
        scram = ScramEnvironment(logger=self.logger)

        configArguments.update({'jobarch': scram.getScramArch(),
                                'jobsw': scram.getCmsswVersion()})

        # Build tarball
        if self.workdir:
            tarUUID =  PandaInterface.wrappedUuidGen()
            self.logger.debug('UNIQUE NAME: tarUUID %s ' % tarUUID)
            if len(tarUUID):
                tarFilename   = os.path.join(self.workdir, tarUUID + 'default.tgz')
                debugTarFilename = os.path.join(self.workdir, 'debugFiles.tgz')
                cfgOutputName = os.path.join(self.workdir, BOOTSTRAP_CFGFILE)
            else:
                raise EnvironmentException('Problem with uuidgen while preparing for Sandbox upload.')
        else:
            _, tarFilename   = tempfile.mkstemp(suffix='.tgz')
            _, debugTarFilename = tempfile.mkstemp(suffix='.tgz')  # needed later for the debug tarball
            _, cfgOutputName = tempfile.mkstemp(suffix='_cfg.py')

        if getattr(self.config.Data, 'inputDataset', None):
            configArguments['inputdata'] = self.config.Data.inputDataset

        ## Create CMSSW config.
        self.logger.debug("self.config: %s" % (self.config))
        self.logger.debug("self.config.JobType.psetName: %s" % (self.config.JobType.psetName))
        ## The loading of a CMSSW pset in the CMSSWConfig constructor is not idempotent
        ## in the sense that a second loading of the same pset may not produce the same
        ## result. Therefore there is a cache in CMSSWConfig to avoid loading any CMSSW
        ## pset twice. However, some "complicated" psets seem to evade the caching.
        ## Thus, to be safe, keep the CMSSWConfig instance in a class variable, so that
        ## it can be reused later if wanted (for example, in PrivateMC when checking if
        ## the pset has an LHE source) instead of having to load the pset again.
        ## As for what does "complicated" psets mean, Daniel Riley said that there are
        ## some psets where one module modifies the configuration from another module.
        self.cmsswCfg = CMSSWConfig(config=self.config, logger=self.logger,
                                    userConfig=self.config.JobType.psetName)

        ## If there is a CMSSW pset, do a basic validation of it.
        if not bootstrapDone() and self.config.JobType.psetName:
            valid, msg = self.cmsswCfg.validateConfig()
            if not valid:
                raise ConfigurationException(msg)

        ## We need to put the pickled CMSSW configuration in the right place.
        ## Here, we determine if the bootstrap script already run and prepared everything
        ## for us. In such case we move the file, otherwise we pickle.dump the pset
        if not bootstrapDone():
            # Write out CMSSW config
            self.cmsswCfg.writeFile(cfgOutputName)
        else:
            # Move the pickled and the configuration files created by the bootstrap script
            self.moveCfgFile(cfgOutputName)

        ## Interrogate the CMSSW pset for output files (only output files produced by
        ## PoolOutputModule or TFileService are identified automatically). Do this
        ## automatic detection even if JobType.disableAutomaticOutputCollection = True,
        ## so that we can still classify the output files in EDM, TFile and additional
        ## output files in the Task DB (and the job ad).
        ## TODO: Do we really need this classification at all? cmscp and PostJob read
        ## the FJR to know if an output file is EDM, TFile or other.
        edmfiles, tfiles = self.cmsswCfg.outputFiles()
        ## If JobType.disableAutomaticOutputCollection = True, ignore the EDM and TFile
        ## output files that are not listed in JobType.outputFiles.
        if getattr(self.config.JobType, 'disableAutomaticOutputCollection', getParamDefaultValue('JobType.disableAutomaticOutputCollection')):
            outputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', [])]
            edmfiles = [file for file in edmfiles if file in outputFiles]
            tfiles = [file for file in tfiles if file in outputFiles]
        ## Get the list of additional output files that have to be collected as given
        ## in JobType.outputFiles, but remove duplicates listed already as EDM files or
        ## TFiles.
        addoutputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'outputFiles', []) if re.sub(r'^file:', '', file) not in edmfiles+tfiles]
        self.logger.debug("The following EDM output files will be collected: %s" % edmfiles)
        self.logger.debug("The following TFile output files will be collected: %s" % tfiles)
        self.logger.debug("The following user output files will be collected: %s" % addoutputFiles)
        configArguments['edmoutfiles'] = edmfiles
        configArguments['tfileoutfiles'] = tfiles
        configArguments['addoutputfiles'].extend(addoutputFiles)
        ## Give warning message in case no output file was detected in the CMSSW pset
        ## nor was any specified in the CRAB configuration.
        if not configArguments['edmoutfiles'] and not configArguments['tfileoutfiles'] and not configArguments['addoutputfiles']:
            msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
            if getattr(self.config.JobType, 'disableAutomaticOutputCollection', getParamDefaultValue('JobType.disableAutomaticOutputCollection')):
                msg += " Automatic detection of output files in the CMSSW configuration is disabled from the CRAB configuration"
                msg += " and no output file was explicitly specified in the CRAB configuration."
            else:
                msg += " CRAB could not detect any output file in the CMSSW configuration"
                msg += " nor was any explicitly specified in the CRAB configuration."
            msg += " Hence CRAB will not collect any output file from this task."
            self.logger.warning(msg)

        ## UserTarball calls ScramEnvironment which can raise EnvironmentException.
        ## Since ScramEnvironment is already called above and the exception is not
        ## handled, we are sure that if we reached this point it will not raise EnvironmentException.
        ## But otherwise we should take this into account.
        with UserTarball(name=tarFilename, logger=self.logger, config=self.config) as tb:
            inputFiles = [re.sub(r'^file:', '', file) for file in getattr(self.config.JobType, 'inputFiles', [])]
            tb.addFiles(userFiles=inputFiles, cfgOutputName=cfgOutputName)
            configArguments['adduserfiles'] = [os.path.basename(f) for f in inputFiles]
            try:
                uploadResult = tb.upload(filecacheurl = filecacheurl)
            except HTTPException as hte:
                if 'X-Error-Info' in hte.headers:
                    reason = hte.headers['X-Error-Info']
                    reason_re = re.compile(r'\AFile size is ([0-9]*)B\. This is bigger than the maximum allowed size of ([0-9]*)B\.$')
                    re_match = reason_re.match(reason)
                    if re_match:
                        ISBSize = int(re_match.group(1))
                        ISBSizeLimit = int(re_match.group(2))
                        reason  = "%sError%s:" % (colors.RED, colors.NORMAL)
                        reason += " Input sanbox size is ~%sMB. This is bigger than the maximum allowed size of %sMB." % (ISBSize/1024/1024, ISBSizeLimit/1024/1024)
                        ISBContent = sorted(tb.content, reverse=True)
                        biggestFileSize = ISBContent[0][0]
                        ndigits = int(math.ceil(math.log(biggestFileSize+1, 10)))
                        reason += "\nInput sanbox content sorted by size[Bytes]:"
                        for (size, name) in ISBContent:
                            reason += ("\n%" + str(ndigits) + "s\t%s") % (size, name)
                        raise ClientException(reason)
                raise hte
            except Exception as e:
                msg = ("Impossible to calculate the checksum of the sandbox tarball.\nError message: %s.\n"
                       "More details can be found in %s" % (e, self.logger.logfile))
                LOGGERS['CRAB3'].exception(msg) #the traceback is only printed into the logfile
                raise ClientException(msg)

        debugFilesUploadResult = None
        with UserTarball(name=debugTarFilename, logger=self.logger, config=self.config) as dtb:
            dtb.addMonFiles()
            try:
                debugFilesUploadResult = dtb.upload(filecacheurl = filecacheurl)
            except Exception as e:
                msg = ("Problem uploading debug_files.tar.gz.\nError message: %s.\n"
                       "More details can be found in %s" % (e, self.logger.logfile))
                LOGGERS['CRAB3'].exception(msg) #the traceback is only printed into the logfile

        configArguments['cacheurl'] = filecacheurl
        configArguments['cachefilename'] = "%s.tar.gz" % uploadResult
        if debugFilesUploadResult is not None:
            configArguments['debugfilename'] = "%s.tar.gz" % debugFilesUploadResult
        self.logger.debug("Result uploading input files: %(cachefilename)s " % configArguments)

        # Upload list of user-defined input files to process as the primary input
        userFilesList = getattr(self.config.Data, 'userInputFiles', None)
        if userFilesList:
            self.logger.debug("Attaching list of user-specified primary input files.")
            userFilesList = map(string.strip, userFilesList)
            userFilesList = [file for file in userFilesList if file]
            if len(userFilesList) != len(set(userFilesList)):
                msg  = "%sWarning%s:" % (colors.RED, colors.NORMAL)
                msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries."
                msg += " Duplicated entries will be removed."
                self.logger.warning(msg)
            configArguments['userfiles'] = set(userFilesList)
            configArguments['primarydataset'] = getattr(self.config.Data, 'outputPrimaryDataset', 'CRAB_UserFiles')

        lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
        lumi_list = None
        if lumi_mask_name:
            self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name))
            try:
                lumi_list = getLumiList(lumi_mask_name, logger = self.logger)
            except ValueError as ex:
                msg  = "%sError%s:" % (colors.RED, colors.NORMAL)
                msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex)
                raise ConfigurationException(msg)
        run_ranges = getattr(self.config.Data, 'runRange', None)
        if run_ranges:
            run_ranges_is_valid = re.match(r'^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges)
            if run_ranges_is_valid:
                run_list = getRunList(run_ranges)
                if lumi_list:
                    lumi_list.selectRuns(run_list)
                    if not lumi_list:
                        msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null."
                        raise ConfigurationException(msg)
                else:
                    if len(run_list) > 50000:
                        msg  = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list))
                        msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                        raise ConfigurationException(msg)
                    lumi_list = LumiList(runs = run_list)
            else:
                msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. Example: '12345,99900-99910'"
                raise ConfigurationException(msg)
        if lumi_list:
            configArguments['runs'] = lumi_list.getRuns()
            ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
            lumi_mask = lumi_list.getCompactList()
            configArguments['lumis'] = [str(reduce(lambda x,y: x+y, lumi_mask[run]))[1:-1].replace(' ','') for run in configArguments['runs']]

        configArguments['jobtype'] = 'Analysis'

        return tarFilename, configArguments
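
A quick check of the Data.runRange validation pattern used above; the sample
strings are assumptions:

import re
pattern = r'^\d+((?!(-\d+-))(\,|\-)\d+)*$'
assert re.match(pattern, '12345,99900-99910')        # list plus range: valid
assert not re.match(pattern, '99900-99910-99920')    # chained range: rejected
assert not re.match(pattern, '12345,')               # trailing comma: rejected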
Example no. 23
0
class UserTarball(object):
    """
        _UserTarball_

            A subclass of TarFile for the user code tarballs. By default
            creates a new tarball with the user libraries from lib, module,
            and the data/ sections of the src/ area.

            Also adds user specified files in the right place.
    """

    def __init__(self, name=None, mode='w:gz', config=None, logger=None):
        self.config = config
        self.logger = logger
        self.scram = ScramEnvironment(logger=self.logger)
        self.logger.debug("Making tarball in %s" % name)
        self.tarfile = tarfile.open(name=name , mode=mode, dereference=True)
        self.checksum = None
        PandaInterface.LOGGER = logging.getLogger('CRAB3:traceback')

    def addFiles(self, userFiles=None, cfgOutputName=None):
        """
        Add the necessary files to the tarball
        """
        directories = ['lib', 'module']
        dataDirs    = ['data']
        userFiles = userFiles or []

        # Tar up whole directories
        for directory in  directories:
            fullPath = os.path.join(self.scram.getCmsswBase(), directory)
            self.logger.debug(" checking directory %s" % fullPath)
            if os.path.exists(fullPath):
                self.logger.debug(" adding directory %s to tarball" % fullPath)
                self.checkdirectory(fullPath)
                self.tarfile.add(fullPath, directory, recursive=True)

        # Search for and tar up "data" directories in src/
        srcPath = os.path.join(self.scram.getCmsswBase(), 'src')
        for root, _dummy, _dummy in os.walk(srcPath):
            if os.path.basename(root) in dataDirs:
                directory = root.replace(srcPath,'src')
                self.logger.debug(" adding data directory %s to tarball" % root)
                self.checkdirectory(root)
                self.tarfile.add(root, directory, recursive=True)

        # Tar up extra files the user needs
        for globName in userFiles:
            fileNames = glob.glob(globName)
            if not fileNames:
                raise InputFileNotFoundException('The input file "%s" taken from parameter config.JobType.inputFiles cannot be found' % globName)
            for filename in fileNames:
                self.logger.debug(" adding file %s to tarball" % filename)
                self.checkdirectory(filename)
                self.tarfile.add(filename, os.path.basename(filename), recursive=True)

        # Adding the pset file to the tarfile
        if cfgOutputName:
            self.tarfile.add(cfgOutputName, arcname='PSet.py')
        currentPath = os.getcwd()

#        psetfile = getattr(self.config.JobType, 'psetName', None)
#        self.tarfile.add(os.path.join(currentPath, psetfile), arcname='PSet.py')

    def close(self):
        """
        Calculate the checkum and clos
        """

        self.calculateChecksum()
        return self.tarfile.close()

    def upload(self):
        """
        Upload the tarball to the File Cache
        """
        self.close()
        archiveName = self.tarfile.name
        serverUrl = ""
        self.logger.debug(" uploading archive to cache %s " % archiveName)
        ufc = UserFileCache({'endpoint' : self.config.JobType.filecacheurl})
        result = ufc.upload(archiveName)
        if 'hashkey' not in result:
            self.logger.error("Failed to upload source files: %s" % str(result))
            raise CachefileNotFoundException
        return self.config.JobType.filecacheurl, str(result['hashkey']) + '.tar.gz', self.checksum


    def calculateChecksum(self):
        """
        Calculate a checksum that doesn't depend on the tgz
        creation data
        """
        lsl = [(x.name, int(x.size), int(x.mtime), x.uname) for x in self.tarfile.getmembers()]
        hasher = hashlib.md5(str(lsl))
        self.logger.debug('tgz contents: %s' % lsl)
        self.checksum = hasher.hexdigest()
        self.logger.debug('MD5 checksum: %s' % self.checksum)

        #Old way reads in the file again. May use for for non-tar files if needed.
        #sha256sum = hashlib.sha256()
        #with open(self.tarfile.name, 'rb') as f:
            #while True:
                #chunkdata = f.read(8192)
                #if not chunkdata:
                    #break
                #sha256sum.update(chunkdata)
        #sha256sum.hexdigest()

    def checkdirectory(self, dir_):
        #checking for infinite symbolic link loop
        try:
            for root, _, files in os.walk(dir_, followlinks=True):
                for file_ in files:
                    os.stat(os.path.join(root, file_))
        except OSError as msg:
            err = '%sError%s: Infinite directory loop found in: %s \nStderr: %s' % \
                    (colors.RED, colors.NORMAL, dir_, msg)
            raise EnvironmentException(err)
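
A sketch of the metadata-based checksum used above: hashing the (name, size,
mtime, uname) tuples instead of the archive bytes keeps the result independent
of the gzip creation timestamp. The member list is an assumed example; in
python3 the string must be encoded before hashing:

import hashlib
lsl = [('lib/libUserCode.so', 20480, 1500000000, 'cmsusr')]   # assumed members
checksum = hashlib.md5(str(lsl).encode('utf-8')).hexdigest()
print('MD5 checksum: %s' % checksum)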