Beispiel #1
0
    def httpPost(self, filename):
        """
        _httpPost_

        perform an HTTP POST operation to a webserver

        """
        args = {}

        # Preparing a checksum
        blockSize = 0x10000

        def upd(m, data):
            m.update(data)
            return m

        fd = open(filename, 'rb')
        try:
            contents = iter(lambda: fd.read(blockSize), '')
            m = reduce(upd, contents, md5())
        finally:
            fd.close()

        args['checksum'] = 'md5:%s' % m.hexdigest()
        #args['checksum'] = 'md5:%s' % md5.new(filename).read()).hexdigest()
        args['size'] = os.path.getsize(filename)

        msg = "HTTP Upload is about to start:\n"
        msg += " => URL: %s\n" % self.step.upload.URL
        msg += " => Filename: %s\n" % filename
        logging.info(msg)

        try:
            for uploadURL in self.step.upload.URL.split(';'):
                (headers, data) = self.upload(uploadURL, args, filename)
                msg = 'HTTP upload finished succesfully with response:\n'
                msg += 'Status code: %s\n' % headers.get(
                    "Dqm-Status-Code", None)
                msg += 'Message: %s\n' % headers.get("Dqm-Status-Message",
                                                     None)
                msg += 'Detail: %s\n' % headers.get("Dqm-Status-Detail", None)
                msg += 'Data: %s\n' % str(data)
                logging.info(msg)
        except urllib2.HTTPError as ex:
            msg = 'HTTP upload failed with response:\n'
            msg += 'Status code: %s\n' % ex.hdrs.get("Dqm-Status-Code", None)
            msg += 'Message: %s\n' % ex.hdrs.get("Dqm-Status-Message", None)
            msg += 'Detail: %s\n' % ex.hdrs.get("Dqm-Status-Detail", None)
            msg += 'Error: %s\n' % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60318, "DQMUploadFailure", msg)
        except Exception as ex:
            msg = 'HTTP upload failed with response:\n'
            msg += 'Problem unknown.\n'
            msg += 'Error: %s\n' % str(ex)
            msg += 'Traceback: %s\n' % traceback.format_exc()
            logging.error(msg)
            raise WMExecutionFailure(60318, "DQMUploadFailure", msg)

        return
Beispiel #2
0
    def execute(self, emulator=None):
        """
        _execute_


        """
        stepModule = "WMTaskSpace.%s" % self.stepName
        if (emulator != None):
            return emulator.emulate(self.step, self.job)

        # write the wrapper script to a temporary location
        # I don't pass it directly through os.system because I don't
        # trust that there won't be shell-escape shenanigans with
        # arbitrary input files
        scramSetup = self.step.application.setup.softwareEnvironment
        scramCommand = self.step.application.setup.scramCommand
        scramProject = self.step.application.setup.scramProject
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion
        jobReportXML = self.step.output.jobReport
        cmsswCommand = self.step.application.command.executable
        cmsswConfig = self.step.application.command.configuration
        cmsswArguments = self.step.application.command.arguments

        logging.info("Executing MulticoreCMSSW step")

        multicoreSettings = self.step.application.multicore
        numberOfCores = multicoreSettings.numberOfCores
        logging.info("Multicore configured for %s cores" % numberOfCores)

        #Todo:  Create input file list from job writing filelist one per line to
        filelist = open(multicoreSettings.inputfilelist, 'w')
        for inputFile in self.job['input_files']:
            filelist.write("%s\n" % inputFile['lfn'])
        filelist.close()

        #
        # scram bootstrap
        #
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )

        logging.info("Running SCRAM")
        try:
            projectOutcome = scram.project()
        except Exception, ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(60513, "ScramSetupFailure", msg)
Beispiel #3
0
    def execute(self, emulator = None):
        """
        _execute_

        """
        #Are we using emulators again?
        if (emulator != None):
            return emulator.emulate( self.step, self.job )

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime  = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))
        seName    = overrides.get('seName',    "srm-cms.cern.ch")
        lfnPrefix = overrides.get('lfnPrefix', "srm://srm-cms.cern.ch:8443/srm/managerv2?SFN=/castor/cern.ch/cms")
        lfnBase   = overrides.get('lfnBase',   "/store/user/jsmith")
        userLogs  = overrides.get('userLogs',  False)
        dontStage = overrides.get('dontStage', False)

        stageOutParams = {"command": "srmv2-lcg",
                          "se-name": seName,  "lfn-prefix": lfnPrefix}
        
        #Okay, we need a stageOut Manager
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            (overrides.has_key('newStageOut') and overrides.get('newStageOut')):
            useNewStageOutCode = True
        try:
            if not useNewStageOutCode:
                # old style
                deleteMgr   = DeleteMgr.DeleteMgr()
                stageInMgr  = StageInMgr.StageInMgr()
                stageOutMgr = StageOutMgr.StageOutMgr(**stageOutParams)
            else:
                # new style
                logging.info("LOGCOLLECT IS USING NEW STAGEOUT CODE")
                stageOutMgr = WMCore.Storage.FileManager.StageOutMgr(
                                    retryPauseTime  = self.step.retryDelay,
                                    numberOfRetries = self.step.retryCount,
                                    **overrides)
                stageInMgr = WMCore.Storage.FileManager.StageInMgr(
                                    retryPauseTime  = self.step.retryDelay,
                                    numberOfRetries = self.step.retryCount )
                deleteMgr = WMCore.Storage.FileManager.DeleteMgr(
                                    retryPauseTime  = self.step.retryDelay,
                                    numberOfRetries = self.step.retryCount )
    
                deleteMgr   = DeleteMgr.DeleteMgr()
                stageInMgr  = StageInMgr.StageInMgr()
                stageOutMgr = StageOutMgr.StageOutMgr(**stageOutParams)
        except StandardError, ex:
            msg = "Unable to load StageIn/Out/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)
Beispiel #4
0
    def __call__(self, task, wmbsJob):
        """
        _operator(task)_

        Load and run executors for all steps in Task, if an emulator is
        available for that step, use it instead.

        """

        myThread = threading.currentThread

        try:
            myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
            myThread.watchdogMonitor.notifyJobStart(task)
        except WMException:
            self.toTaskDirectory()
            raise
        except Exception, ex:
            msg = "Encountered unhandled exception while starting monitors:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()
            raise WMExecutionFailure(msg)
Beispiel #5
0
class CMSSW(Executor):
    """
    _CMSWW_

    Execute a CMSSW Step

    """
    def pre(self, emulator=None):
        """
        _pre_

        Pre execution checks

        """
        if (emulator != None):
            return emulator.emulatePre(self.step)
        logging.info("Pre-executing CMSSW step")
        if hasattr(self.step.application.configuration, 'configCacheUrl'):
            # means we have a configuration & tweak in the sandbox
            psetFile = self.step.application.command.configuration
            psetTweak = self.step.application.command.psetTweak
            self.stepSpace.getFromSandbox(psetFile)

            if psetTweak:
                self.stepSpace.getFromSandbox(psetTweak)

        if hasattr(self.step, "pileup"):
            self.stepSpace.getFromSandbox("pileupconf.json")

        # if step is multicore Enabled, add the multicore setup
        # that generates the input file manifest and per process splitting
        stepHelper = WMStepHelper(self.step)
        typeHelper = stepHelper.getTypeHelper()
        if typeHelper.multicoreEnabled():
            self.step.runtime.scramPreScripts.append("SetupCMSSWMulticore")

        # add in ths scram env PSet manip script whatever happens
        self.step.runtime.scramPreScripts.append("SetupCMSSWPset")
        return None

    def execute(self, emulator=None):
        """
        _execute_


        """
        stepModule = "WMTaskSpace.%s" % self.stepName
        if (emulator != None):
            return emulator.emulate(self.step, self.job)

        # write the wrapper script to a temporary location
        # I don't pass it directly through os.system because I don't
        # trust that there won't be shell-escape shenanigans with
        # arbitrary input files
        scramSetup = self.step.application.setup.softwareEnvironment
        scramCommand = self.step.application.setup.scramCommand
        scramProject = self.step.application.setup.scramProject
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion
        jobReportXML = self.step.output.jobReport
        cmsswCommand = self.step.application.command.executable
        cmsswConfig = self.step.application.command.configuration
        cmsswArguments = self.step.application.command.arguments
        userTarball = ','.join(self.step.user.inputSandboxes)
        userFiles = ','.join(self.step.user.userFiles)
        logging.info('User files are %s' % userFiles)
        logging.info('User sandboxes are %s' % userTarball)

        multicoreSettings = self.step.application.multicore
        multicoreEnabled = multicoreSettings.enabled
        if multicoreEnabled:
            numberOfCores = multicoreSettings.numberOfCores
            logging.info("***Multicore CMSSW Enabled***")
            logging.info("Multicore configured for %s cores" % numberOfCores)
            #create input file list used to generate manifest
            filelist = open(multicoreSettings.inputfilelist, 'w')
            for inputFile in self.job['input_files']:
                filelist.write("%s\n" % inputFile['lfn'])
            filelist.close()

        logging.info("Executing CMSSW step")

        #
        # set any global environment variables
        #
        try:
            os.environ['FRONTIER_ID'] = 'wmagent_%s' % (
                self.report.data.workload)
        except Exception, ex:
            logging.error('Have critical error in setting FRONTIER_ID: %s' %
                          str(ex))
            logging.error(
                'Continuing, as this is not a critical function yet.')
            pass

        #
        # scram bootstrap
        #
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )

        logging.info("Runing SCRAM")
        try:
            projectOutcome = scram.project()
        except Exception, ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
Beispiel #6
0
    def multicoreMerge(self, scramRef, applicationStart):
        """
        If this step is a multicore configured step, the output from the subprocesses needs to be merged
        together and the job report updated.

        """
        logging.info("***Multicore Post processing started***")
        #  //
        # // Now go through the reports and merge the outputs
        #//
        outputReports = readMultiJobReports(self.step.output.jobReport,
                                            self.stepName,
                                            self.step.builder.workingDir)
        self.report = None
        mergeBuckets = {}
        inputFiles = {}
        aggregator = ReportUtils.Aggregator()
        #  //
        # // loop over the sub reports and combine the inputs for recording and outputs for merges
        #//
        for o in outputReports:
            # use one of the reports as the template for the master report
            if self.report == None:
                self.report = o
            aggregator.add(o.report.performance)
            # store list of all unique input files into the job, adding them together if they
            # are fragments of the same file
            for inp in o.getAllInputFiles():
                lfn = inp['lfn']
                if not inputFiles.has_key(lfn):
                    inputFiles[lfn] = inp
                else:
                    addFiles(inputFiles[lfn], inp)
            #  //
            # // process the output modules from each subreport
            #//
            for f in o.getAllFiles():
                lfn = f['outputModule']
                if not mergeBuckets.has_key(lfn):
                    mergeBuckets[lfn] = MergeBucket(
                        f['lfn'], f['outputModule'], self.stepName,
                        self.step.builder.workingDir)
                mergeBuckets[lfn].append(f)

        # clean up the master report from the template
        reportData = getattr(self.report.data, self.stepName)
        for src in reportData.input.listSections_():
            delattr(reportData.input, src)
        for omod in reportData.outputModules:
            delattr(reportData.output, omod)
            reportData.outputModules.remove(omod)

        #  //
        # // Add the reduced list of input files to the master report
        #//
        for f in inputFiles.values():
            self.report.addInputFile(f['module_label'], **f)

        #  //
        # // Now roll through the merges, run the merge and edit the master job report with the outcome
        #//
        mergesStart = time.time()
        mergeTiming = []
        for b in mergeBuckets.values():
            # write the merge config file
            thisMergeStarts = time.time()
            b.writeConfig()
            # run the merge as a scram enabled command
            logging.info("    Invoking command: %s" % b.merge_command)
            logfile = "%s.log" % b.merge_pset_file
            retCode = scramRef(b.merge_command, False, logfile,
                               self.step.builder.workingDir)
            if retCode > 0:
                msg = "Error running merge job:\n%s\n" % b.merge_command
                msg += "Merge Config:\n%s\n" % b.mergeConfig()
                msg += "%s\n " % retCode
                msg += scram.diagnostic()
                logging.critical(msg)
                raise WMExecutionFailure(60666, "MulticoreMergeFailure", msg)
            #  //
            # // add report from merge to master
            #//  ToDo: try/except here in case report is missing
            b.editReport(self.report)
            thisMergeTime = time.time() - thisMergeStarts
            mergeTiming.append(thisMergeTime)
        mergesComplete = time.time()
        totalJobTime = mergesComplete - applicationStart
        self.report.report.performance = aggregator.aggregate()
        ReportUtils.updateMulticoreReport(self.report, len(mergeBuckets),
                                          mergesStart, mergesComplete,
                                          totalJobTime, *mergeTiming)

        logging.info("***Multicore Post processing complete***")
        return
Beispiel #7
0
        logging.info("Runing SCRAM")
        try:
            projectOutcome = scram.project()
        except Exception, ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            #self.report.addError(60513, "ScramSetupFailure", msg)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            #self.report.addError(60513, "ScramSetupFailure", msg)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        #
        # pre scripts
        #
        logging.info("Running PRE scripts")
        for script in self.step.runtime.preScripts:
            # TODO: Exception handling and error handling & logging
            scriptProcess = subprocess.Popen(
Beispiel #8
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        logging.info("Steps.Executors.%s.execute called", self.__class__.__name__)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()
        logging.info("Using the following overrides: %s ", overrides)
        # Find alternate stageout location
        self.altLFN = overrides.get('altLFN', None)
        self.failedPreviousStep = overrides.get('previousCmsRunFailure', False)

        logging.info("Step configuration is: %s", self.step)
        # Wait timeout for stageOut
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        matchFiles = [
            ".log$",  # matches the scram, wmagent and cmsRun logs
            "FrameworkJobReport.xml",
            "Report.pkl",
            "^PSet.py$",
            "^PSet.pkl$",
            "_condor_std*",  # condor wrapper logs at the pilot top level
        ]

        ignoredDirs = ['Utils', 'WMCore', 'WMSandbox']

        # Okay, we need a stageOut Manager
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
                ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True
        if not useNewStageOutCode:
            # old style
            manager = StageOutMgr.StageOutMgr(**overrides)
            manager.numberOfRetries = self.step.retryCount
            manager.retryPauseTime = self.step.retryDelay
        else:
            # new style
            logging.info("LOGARCHIVE IS USING NEW STAGEOUT CODE")
            manager = WMCore.Storage.FileManager.StageOutMgr(retryPauseTime=self.step.retryDelay,
                                                             numberOfRetries=self.step.retryCount,
                                                             **overrides)

        # Now we need to find all the reports
        # The log search follows this structure: ~pilotArea/jobArea/WMTaskSpaceArea/StepsArea
        # Start looking at the pilot scratch area first, such that we find the condor logs
        # Then look at the job area in order to find the wmagentJob log
        # Finally, at the taskspace area to find the cmsRun/FWJR/PSet files
        pilotScratchDir = os.path.join(self.stepSpace.taskSpace.location, '../../')
        logFilesToArchive = self.findFilesInDirectory(pilotScratchDir, matchFiles, ignoredDirs)

        # What if it's empty?
        if len(logFilesToArchive) == 0:
            msg = "Couldn't find any log files in the job"
            logging.error(msg)
            return logFilesToArchive

        # Now that we've gone through all the steps, we have to tar it out
        tarName = 'logArchive.tar.gz'
        tarBallLocation = os.path.join(self.stepSpace.location, tarName)
        with tarfile.open(tarBallLocation, 'w:gz') as tarBall:
            for fName in logFilesToArchive:
                altName = fName.replace(pilotScratchDir, '', 1)
                tarBall.add(name=fName, arcname=altName)

        fileInfo = {'LFN': self.getLFN(tarName),
                    'PFN': tarBallLocation,
                    'PNN': None,
                    'GUID': None
                    }
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)

        try:
            manager(fileInfo)
            self.report.addOutputModule(moduleName="logArchive")
            (adler32, cksum) = calculateChecksums(tarBallLocation)
            reportFile = {"lfn": fileInfo["LFN"], "pfn": fileInfo["PFN"],
                          "location": fileInfo["PNN"], "module_label": "logArchive",
                          "events": 0, "size": 0, "merged": False,
                          "checksums": {'adler32': adler32, 'cksum': cksum}}
            self.report.addOutputFile(outputModule="logArchive", aFile=reportFile)
        except Alarm:
            msg = "Indefinite hang during stageOut of logArchive"
            logging.error(msg)
            self.report.addError(self.stepName, 60404, "LogArchiveTimeout", msg)
            self.saveReport()
            raise WMExecutionFailure(60404, "LogArchiveTimeout", msg)
        except WMException as ex:
            self.report.addError(self.stepName, 60307, "LogArchiveFailure", str(ex))
            self.saveReport()
            raise ex
        except Exception as ex:
            self.report.addError(self.stepName, 60405, "LogArchiveFailure", str(ex))
            self.saveReport()
            msg = "Failure in transferring logArchive tarball\n"
            logging.exception(msg)
            raise WMException("LogArchiveFailure", message=str(ex))
        signal.alarm(0)

        signal.alarm(waitTime)
        self.sendLogToEOS(overrides, tarBallLocation, useNewStageOutCode)
        signal.alarm(0)

        return
Beispiel #9
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        # Are we using emulators again?
        if emulator != None:
            return emulator.emulate(self.step, self.job)

        logging.info("Steps.Executors.%s.execute called",
                     self.__class__.__name__)

        # Search through steps for analysis files
        for step in self.stepSpace.taskSpace.stepSpaces():
            if step == self.stepName:
                # Don't try to parse your own report; it's not there yet
                continue
            stepLocation = os.path.join(self.stepSpace.taskSpace.location,
                                        step)
            logging.info("Beginning report processing for step %s", step)
            reportLocation = os.path.join(stepLocation, 'Report.pkl')
            if not os.path.isfile(reportLocation):
                logging.error("Cannot find report for step %s in space %s",
                              step, stepLocation)
                continue

            # First, get everything from a file and 'unpersist' it
            stepReport = Report()
            stepReport.unpersist(reportLocation, step)

            # Don't upload nor stage out files from bad steps.
            if not stepReport.stepSuccessful(step):
                continue

            # Pulling out the analysis files from each step
            analysisFiles = stepReport.getAnalysisFilesFromStep(step)

            # make sure all conditions from this job get the same uuid
            uuid = makeUUID()

            condFiles2copy = []
            lumiFiles2copy = []

            # Working on analysis files
            for analysisFile in analysisFiles:

                # deal with sqlite files
                if analysisFile.FileClass == "ALCA":

                    sqlitefile = analysisFile.fileName.replace(
                        'sqlite_file:', '', 1)

                    filenamePrefix = "Run%d@%s@%s" % (
                        self.step.condition.runNumber, analysisFile.inputtag,
                        uuid)
                    filenameDB = filenamePrefix + ".db"
                    filenameTXT = filenamePrefix + ".txt"

                    shutil.copy2(os.path.join(stepLocation, sqlitefile),
                                 filenameDB)

                    textoutput = "prepMetaData %s\n" % analysisFile.prepMetaData
                    textoutput += "prodMetaData %s\n" % analysisFile.prodMetaData

                    with open(filenameTXT, "w") as fout:
                        fout.write(textoutput)

                    os.chmod(
                        filenameDB, stat.S_IREAD | stat.S_IWRITE | stat.S_IRUSR
                        | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP
                        | stat.S_IROTH | stat.S_IWOTH)
                    os.chmod(
                        filenameTXT, stat.S_IREAD | stat.S_IWRITE
                        | stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                        | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH)

                    condFiles2copy.append(filenameDB)
                    condFiles2copy.append(filenameTXT)

                # deal with text files containing lumi info
                elif analysisFile.FileClass == "ALCATXT":

                    shutil.copy2(
                        os.path.join(stepLocation, analysisFile.fileName),
                        analysisFile.fileName)
                    lumiFiles2copy.append(analysisFile.fileName)

            # copy conditions files out and fake the job report
            addedOutputFJR = False
            if self.step.condition.lfnbase:
                logging.info("Copy out conditions files to %s",
                             self.step.condition.lfnbase)
                for file2copy in condFiles2copy:

                    logging.info("==> copy %s", file2copy)

                    targetLFN = os.path.join(self.step.condition.lfnbase,
                                             file2copy)
                    targetPFN = "root://eoscms//eos/cms%s" % targetLFN

                    command = "env XRD_WRITERECOVERY=0 xrdcp -s -f %s %s" % (
                        file2copy, targetPFN)

                    p = subprocess.Popen(command,
                                         shell=True,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.STDOUT)
                    output = p.communicate()[0]
                    if p.returncode > 0:
                        msg = 'Failure during condition copy to EOS:\n'
                        msg += '   %s\n' % output
                        logging.error(msg)
                        raise WMExecutionFailure(60319, "AlcaHarvestFailure",
                                                 msg)

                    # add fake output file to job report
                    addedOutputFJR = True
                    stepReport.addOutputFile(self.step.condition.outLabel,
                                             aFile={
                                                 'lfn':
                                                 targetLFN,
                                                 'pfn':
                                                 targetPFN,
                                                 'module_label':
                                                 self.step.condition.outLabel
                                             })

            # copy luminosity files out
            if self.step.luminosity.url:
                logging.info("Copy out luminosity files to %s",
                             self.step.luminosity.url)
                for file2copy in lumiFiles2copy:

                    logging.info("==> copy %s", file2copy)

                    targetPFN = rootUrlJoin(self.step.luminosity.url,
                                            file2copy)
                    if not targetPFN:
                        msg = 'No valid URL for lumi copy:\n'
                        msg += '   %s\n' % self.step.luminosity.url
                        logging.error(msg)
                        raise WMExecutionFailure(60319, "AlcaHarvestFailure",
                                                 msg)

                    command = "env XRD_WRITERECOVERY=0 xrdcp -s -f %s %s" % (
                        file2copy, targetPFN)

                    p = subprocess.Popen(command,
                                         shell=True,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.STDOUT)
                    output = p.communicate()[0]
                    if p.returncode > 0:
                        msg = 'Failure during copy to EOS:\n'
                        msg += '   %s\n' % output
                        logging.error(msg)
                        raise WMExecutionFailure(60319, "AlcaHarvestFailure",
                                                 msg)

            if not addedOutputFJR:
                # no conditions from AlcaHarvest is a valid result, can
                # happen if calibration algorithms produced no output
                # due to not enough statistics or other reasons
                #
                # add fake placeholder output file to job report
                logging.info(
                    "==> no sqlite files from AlcaHarvest job, creating placeholder file record"
                )
                stepReport.addOutputFile(self.step.condition.outLabel,
                                         aFile={
                                             'lfn':
                                             "/no/output",
                                             'pfn':
                                             "/no/output",
                                             'module_label':
                                             self.step.condition.outLabel
                                         })

            # Am DONE with report
            # Persist it
            stepReport.persist(reportLocation)

        return
Beispiel #10
0
            logging.info(msg)
        except urllib2.HTTPError, ex:
            msg = 'HTTP upload failed with response:\n'
            msg += 'Status code: %s\n' % ex.hdrs.get("Dqm-Status-Code", None)
            msg += 'Message: %s\n' % ex.hdrs.get("Dqm-Status-Message", None)
            msg += 'Detail: %s\n' % ex.hdrs.get("Dqm-Status-Detail", None)
            msg += 'Error: %s\n' % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60318, "DQMUploadFailure", msg)
        except Exception, ex:
            msg = 'HTTP upload failed with response:\n'
            msg += 'Problem unknown.\n'
            msg += 'Error: %s\n' % str(ex)
            msg += 'Traceback: %s\n' % traceback.format_exc()
            logging.error(msg)
            raise WMExecutionFailure(60318, "DQMUploadFailure", msg)

        return

    def filetype(self, filename):
        return guess_type(filename)[0] or 'application/octet-stream'

    def encode(self, args, files):
        """
        Encode form (name, value) and (name, filename, type) elements into
        multi-part/form-data. We don't actually need to know what we are
        uploading here, so just claim it's all text/plain.
        """
        boundary = '----------=_DQM_FILE_BOUNDARY_=-----------'
        (body, crlf) = ('', '\r\n')
        for (key, value) in args.items():
Beispiel #11
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        #Are we using emulators again?
        if (emulator != None):
            return emulator.emulate(self.step, self.job)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get(
            'waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))
        seName = overrides.get('seName', "srm-cms.cern.ch")
        lfnPrefix = overrides.get(
            'lfnPrefix',
            "srm://srm-cms.cern.ch:8443/srm/managerv2?SFN=/castor/cern.ch/cms")
        lfnBase = overrides.get('lfnBase', "/store/user/jsmith")
        userLogs = overrides.get('userLogs', False)
        dontStage = overrides.get('dontStage', False)

        stageOutParams = {
            "command": "srmv2-lcg",
            "se-name": seName,
            "lfn-prefix": lfnPrefix
        }

        #Okay, we need a stageOut Manager
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True
        try:
            if not useNewStageOutCode:
                # old style
                deleteMgr = DeleteMgr.DeleteMgr()
                stageInMgr = StageInMgr.StageInMgr()
                stageOutMgr = StageOutMgr.StageOutMgr(**stageOutParams)
            else:
                # new style
                logging.info("LOGCOLLECT IS USING NEW STAGEOUT CODE")
                stageOutMgr = WMCore.Storage.FileManager.StageOutMgr(
                    retryPauseTime=self.step.retryDelay,
                    numberOfRetries=self.step.retryCount,
                    **overrides)
                stageInMgr = WMCore.Storage.FileManager.StageInMgr(
                    retryPauseTime=self.step.retryDelay,
                    numberOfRetries=self.step.retryCount)
                deleteMgr = WMCore.Storage.FileManager.DeleteMgr(
                    retryPauseTime=self.step.retryDelay,
                    numberOfRetries=self.step.retryCount)

                deleteMgr = DeleteMgr.DeleteMgr()
                stageInMgr = StageInMgr.StageInMgr()
                stageOutMgr = StageOutMgr.StageOutMgr(**stageOutParams)
        except Exception as ex:
            msg = "Unable to load StageIn/Out/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # Now we need the logs
        if not dontStage:  # Don't stage or delete files
            logs = []
            for file in self.job["input_files"]:
                logs.append({"LFN": file["lfn"]})

            readyFiles = []
            for file in logs:
                signal.signal(signal.SIGALRM, alarmHandler)
                signal.alarm(waitTime)
                try:
                    output = stageInMgr(**file)
                    readyFiles.append(output)
                    self.report.addInputFile(sourceName="logArchives",
                                             lfn=file['LFN'])
                except Alarm:
                    msg = "Indefinite hang during stageIn of LogCollect"
                    logging.error(msg)
                    self.report.addError(self.stepName, 60407,
                                         "LogCollectTimeout", msg)
                    self.report.persist("Report.pkl")
                except StageOutFailure as ex:
                    msg = "Unable to StageIn %s" % file['LFN']
                    logging.error(msg)
                    # Don't do anything other then record it
                    self.report.addSkippedFile(file.get('PFN', None),
                                               file['LFN'])
                except Exception as ex:
                    raise

                signal.alarm(0)

            if len(readyFiles) == 0:
                # Then we have no output; all the files failed
                # Panic!
                msg = "No logs staged in during LogCollect step"
                logging.error(msg)
                raise WMExecutionFailure(60312, "LogCollectError", msg)

            now = datetime.datetime.now()
            tarPFN = self.createArchive(readyFiles)
            if userLogs:
                lfn = "%s/%s/logs/%s" % (lfnBase, self.report.data.workload,
                                         os.path.basename(tarPFN))
            else:
                lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (
                    now.year, now.month, "WMAgent", self.report.data.workload,
                    os.path.basename(tarPFN))

            tarInfo = {'LFN': lfn, 'PFN': tarPFN, 'SEName': None, 'GUID': None}

            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(waitTime)
            try:
                stageOutMgr(tarInfo)
                self.report.addOutputFile(outputModule="LogCollect",
                                          file=tarInfo)
            except Alarm:
                msg = "Indefinite hang during stageOut of LogCollect"
                logging.error(msg)
                raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
            except Exception as ex:
                msg = "Unable to stage out log archive:\n"
                msg += str(ex)
                print "MSG: %s" % msg
                raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
            signal.alarm(0)

            # If we're still here we didn't die on stageOut
            for file in self.job["input_files"]:
                signal.signal(signal.SIGALRM, alarmHandler)
                signal.alarm(waitTime)
                try:
                    fileToDelete = {
                        "LFN": file["lfn"],
                        "PFN": None,
                        "SEName": None,
                        "StageOutCommand": None
                    }
                    deleteMgr(fileToDelete=fileToDelete)
                except Alarm:
                    msg = "Indefinite hang during delete of LogCollect"
                    logging.error(msg)
                    raise WMExecutionFailure(60411, "DeleteTimeout", msg)
                except Exception as ex:
                    msg = "Unable to delete files:\n"
                    msg += str(ex)
                    logging.error(msg)
                    raise WMExecutionFailure(60410, "DeleteError", msg)
                signal.alarm(0)

        # Add to report
        outputRef = getattr(self.report.data, self.stepName)
        if dontStage:
            outputRef.output.pfn = 'NotStaged'
            outputRef.output.location = 'NotStaged'
            outputRef.output.lfn = 'NotStaged'
        else:
            outputRef.output.pfn = tarInfo['PFN']
            outputRef.output.location = tarInfo['SEName']
            outputRef.output.lfn = tarInfo['LFN']
        return
Beispiel #12
0
    def __call__(self, task, wmbsJob):
        """
        _operator(task)_

        Load and run executors for all steps in Task, if an emulator is
        available for that step, use it instead.

        """

        myThread = threading.currentThread

        try:
            myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
            myThread.watchdogMonitor.notifyJobStart(task)
        except WMException:
            self.toTaskDirectory()
            raise
        except Exception as ex:
            msg = "Encountered unhandled exception while starting monitors:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()
            raise WMExecutionFailure(msg)

        failureUpstream = False
        for step in task.steps().nodeIterator():
            try:
                helper = WMStepHelper(step)
                stepType = helper.stepType()
                stepName = helper.name()
                if failureUpstream:
                    # for chained steps, don't execute further steps if a
                    # failure has already happened
                    helper.addOverride("previousCmsRunFailure", True)

                executor = StepFactory.getStepExecutor(stepType)
                result = self.doExecution(executor, step, wmbsJob)
                logging.info("StepName: %s, StepType: %s, with result: %r",
                             stepName, stepType, result)
                if result:  # can be either None, or the step exit code
                    failureUpstream = True
            except WMException as ex:
                msg = "Encountered error while running ExecuteMaster:\n"
                msg += str(ex) + "\n"
                logging.error(msg)
                self.toTaskDirectory()
                break
            except Exception as ex:
                msg = "Encountered error while running ExecuteMaster:\n"
                msg += str(ex) + "\n"
                msg += str(traceback.format_exc()) + "\n"
                self.toTaskDirectory()
                logging.error(msg)
                break

        try:
            myThread.watchdogMonitor.notifyJobEnd(task)
        except WMException:
            self.toTaskDirectory()
        except Exception as ex:
            msg = "Encountered unhandled exception while ending the job:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()

        return
Beispiel #13
0
    def __call__(self, task, wmbsJob):
        """
        _operator(task)_

        Load and run executors for all steps in Task, if an emulator is
        available for that step, use it instead.

        """

        myThread = threading.currentThread

        try:
            myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
            myThread.watchdogMonitor.notifyJobStart(task)
        except WMException:
            self.toTaskDirectory()
            raise
        except Exception as ex:
            msg =  "Encountered unhandled exception while starting monitors:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()
            raise WMExecutionFailure(msg)

        skipToStep = None
        for step in task.steps().nodeIterator():
            try:
                helper = WMStepHelper(step)
                stepType = helper.stepType()
                stepName = helper.name()
                if skipToStep and skipToStep != stepName:
                    # Then we continue until we get to the required step
                    continue
                skipToStep = None # Reset this when we get to the right step
                executor = StepFactory.getStepExecutor(stepType)
                result = self.doExecution(executor, step, wmbsJob)
                if not result == None:
                    skipToStep = result
            except WMException as ex:
                msg = "Encountered error while running ExecuteMaster:\n"
                msg += str(ex) + "\n"
                logging.error(msg)
                self.toTaskDirectory()
                break
            except Exception as ex:
                msg = "Encountered error while running ExecuteMaster:\n"
                msg += str(ex) + "\n"
                msg += str(traceback.format_exc()) + "\n"
                self.toTaskDirectory()
                logging.error(msg)
                break


        try:
            myThread.watchdogMonitor.notifyJobEnd(task)
        except WMException:
            self.toTaskDirectory()
        except Exception as ex:
            msg =  "Encountered unhandled exception while ending the job:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()

        return
Beispiel #14
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        #Are we using emulators again?
        if emulator != None:
            return emulator.emulate(self.step, self.job)

        # Search through steps for analysis files
        for step in self.stepSpace.taskSpace.stepSpaces():
            if step == self.stepName:
                #Don't try to parse your own report; it's not there yet
                continue
            stepLocation = os.path.join(self.stepSpace.taskSpace.location,
                                        step)
            logging.info("Beginning report processing for step %s" % step)
            reportLocation = os.path.join(stepLocation, 'Report.pkl')
            if not os.path.isfile(reportLocation):
                logging.error("Cannot find report for step %s in space %s" \
                              % (step, stepLocation))
                continue

            # First, get everything from a file and 'unpersist' it
            stepReport = Report()
            stepReport.unpersist(reportLocation, step)

            # Don't upload nor stage out files from bad steps.
            if not stepReport.stepSuccessful(step):
                continue

            # Pulling out the analysis files from each step
            analysisFiles = stepReport.getAnalysisFilesFromStep(step)

            # make sure all conditions from this job get the same uuid
            uuid = makeUUID()

            files2copy = []

            # Working on analysis files
            for analysisFile in analysisFiles:

                # only deal with sqlite files
                if analysisFile.FileClass == "ALCA":

                    sqlitefile = analysisFile.fileName.replace(
                        'sqlite_file:', '', 1)

                    filenamePrefix = "Run%d@%s@%s" % (
                        self.step.condition.runNumber, analysisFile.inputtag,
                        uuid)
                    filenameDB = filenamePrefix + ".db"
                    filenameTXT = filenamePrefix + ".txt"

                    shutil.copy2(os.path.join(stepLocation, sqlitefile),
                                 filenameDB)

                    textoutput = "prepMetaData %s\n" % analysisFile.prepMetaData
                    textoutput += "prodMetaData %s\n" % analysisFile.prodMetaData

                    fout = open(filenameTXT, "w")
                    fout.write(textoutput)
                    fout.close()

                    os.chmod(
                        filenameDB, stat.S_IREAD | stat.S_IWRITE | stat.S_IRUSR
                        | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP
                        | stat.S_IROTH | stat.S_IWOTH)
                    os.chmod(
                        filenameTXT, stat.S_IREAD | stat.S_IWRITE
                        | stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                        | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH)

                    files2copy.append(filenameDB)
                    files2copy.append(filenameTXT)

            # check and create target directory
            if not os.path.isdir(self.step.condition.dir):
                msg = 'Conditions copy failed with response:\n'
                msg += 'The target dir %s does not exist or is not a directory\n'
                logging.error(msg)
                raise WMExecutionFailure(60319, "AlcaHarvestFailure", msg)

            # copy files out and fake the job report
            logging.info("Copy out conditions files to %s" %
                         self.step.condition.dir)
            for file2copy in files2copy:

                logging.info("==> copy %s" % file2copy)

                targetFile = os.path.join(self.step.condition.dir, file2copy)

                try:
                    shutil.copy2(file2copy, targetFile)
                except Exception, ex:
                    msg = 'Conditions copy failed with response:\n'
                    msg += 'Error: %s\n' % str(ex)
                    logging.error(msg)
                    raise WMExecutionFailure(60319, "AlcaHarvestFailure", msg)

                # add fake output file to job report
                stepReport.addOutputFile(self.step.condition.outLabel,
                                         file={
                                             'lfn':
                                             targetFile,
                                             'pfn':
                                             targetFile,
                                             'module_label':
                                             self.step.condition.outLabel
                                         })

            if len(files2copy) == 0:

                # no output from AlcaHarvest is a valid result, can
                # happen if calibration algorithms produced no output
                # due to not enough statistics or other reasons
                #
                # add fake placeholder output file to job report
                logging.info(
                    "==> no sqlite files from AlcaHarvest job, creating placeholder file record"
                )
                stepReport.addOutputFile(self.step.condition.outLabel,
                                         file={
                                             'lfn':
                                             "/no/output",
                                             'pfn':
                                             "/no/output",
                                             'module_label':
                                             self.step.condition.outLabel
                                         })

            # Am DONE with report
            # Persist it
            stepReport.persist(reportLocation)
Beispiel #15
0
    def httpPost(self, filename):
        """
        _httpPost_

        perform an HTTP POST operation to a webserver

        """
        args = {}

        # Preparing a checksum
        blockSize = 0x10000

        def upd(m, data):
            m.update(data)
            return m

        with open(filename, 'rb') as fd:
            contents = iter(lambda: fd.read(blockSize), b'')
            m = reduce(upd, contents, md5())

        args['checksum'] = 'md5:%s' % m.hexdigest()
        # args['checksum'] = 'md5:%s' % md5.new(filename).read()).hexdigest()
        args['size'] = os.path.getsize(filename)

        msg = "HTTP Upload is about to start:\n"
        msg += " => URL: %s\n" % self.step.upload.URL
        msg += " => Filename: %s\n" % filename
        logging.info(msg)

        try:
            logException = True
            for uploadURL in self.step.upload.URL.split(';'):
                (headers, data) = self.upload(uploadURL, args, filename)
                msg = '  Status code: %s\n' % headers.get(
                    "Dqm-Status-Code", None)
                msg += '  Message: %s\n' % headers.get("Dqm-Status-Message",
                                                       None)
                msg += '  Detail: %s\n' % headers.get("Dqm-Status-Detail",
                                                      None)
                msg += '  Data: %s\n' % str(data)
                if int(headers.get("Dqm-Status-Code", 400)) >= 400:
                    logException = False
                    raise Exception(msg)
                else:
                    msg = 'HTTP upload finished succesfully with response:\n' + msg
                    logging.info(msg)
        except HTTPError as ex:
            msg = 'HTTP upload failed with response:\n'
            msg += '  Status code: %s\n' % ex.hdrs.get("Dqm-Status-Code", None)
            msg += '  Message: %s\n' % ex.hdrs.get("Dqm-Status-Message", None)
            msg += '  Detail: %s\n' % ex.hdrs.get("Dqm-Status-Detail", None)
            msg += '  Error: %s\n' % str(ex)
            logging.exception(msg)
            raise WMExecutionFailure(70318, "DQMUploadFailure", msg)
        except Exception as ex:
            msg = 'HTTP upload failed! Error:\n%s' % str(ex)
            if logException:
                logging.exception(msg)
            else:
                logging.error(msg)
            raise WMExecutionFailure(70318, "DQMUploadFailure", msg)

        return
Beispiel #16
0
    def execute(self, emulator=None):
        """
        _execute_


        """
        stepModule = "WMTaskSpace.%s" % self.stepName
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        # write the wrapper script to a temporary location
        # I don't pass it directly through os.system because I don't
        # trust that there won't be shell-escape shenanigans with
        # arbitrary input files
        scramSetup = self.step.application.setup.softwareEnvironment
        scramCommand = self.step.application.setup.scramCommand
        scramProject = self.step.application.setup.scramProject
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion
        jobReportXML = self.step.output.jobReport
        cmsswCommand = self.step.application.command.executable
        cmsswConfig = self.step.application.command.configuration
        cmsswArguments = self.step.application.command.arguments
        userTarball = ','.join(self.step.user.inputSandboxes)
        userFiles = ','.join(self.step.user.userFiles)
        logging.info('User files are %s', userFiles)
        logging.info('User sandboxes are %s', userTarball)

        scramArch = getSingleScramArch(scramArch)

        multicoreSettings = self.step.application.multicore
        try:
            logging.info("CMSSW configured for %s cores and %s event streams",
                         multicoreSettings.numberOfCores,
                         multicoreSettings.eventStreams)
        except AttributeError:
            logging.info(
                "No value set for multicore numberOfCores or eventStreams")

        logging.info("Executing CMSSW step")

        #
        # set any global environment variables
        #
        try:
            os.environ['FRONTIER_ID'] = 'wmagent_%s' % (
                self.report.data.workload)
        except Exception as ex:
            logging.error('Have critical error in setting FRONTIER_ID: %s',
                          str(ex))
            logging.error(
                'Continuing, as this is not a critical function yet.')

        #
        # scram bootstrap
        #
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )

        logging.info("Runing SCRAM")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        #
        # pre scripts
        #
        logging.info("Running PRE scripts")
        for script in self.step.runtime.preScripts:
            # TODO: Exception handling and error handling & logging
            scriptProcess = subprocess.Popen(
                ["/bin/bash"],
                shell=True,
                cwd=self.step.builder.workingDir,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                stdin=subprocess.PIPE,
            )
            # BADPYTHON
            scriptProcess.stdin.write(
                "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH\n")
            invokeCommand = "%s -m WMCore.WMRuntime.ScriptInvoke %s %s \n" % (
                sys.executable, stepModule, script)
            logging.info("    Invoking command: %s", invokeCommand)
            scriptProcess.stdin.write(invokeCommand)
            stdout, stderr = scriptProcess.communicate()
            retCode = scriptProcess.returncode
            if retCode > 0:
                msg = "Error running command\n%s\n" % invokeCommand
                msg += "%s\n %s\n %s\n" % (retCode, stdout, stderr)
                logging.critical("Error running command")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "PreScriptFailure", msg)

        #
        # pre scripts with scram
        #
        logging.info("RUNNING SCRAM SCRIPTS")
        for script in self.step.runtime.scramPreScripts:
            # invoke scripts with scram()
            runtimeDir = getattr(self.step.runtime, 'scramPreDir', None)
            invokeCommand = self.step.runtime.invokeCommand if hasattr(self.step.runtime, 'invokeCommand') else \
                "%s -m WMCore.WMRuntime.ScriptInvoke %s" % (sys.executable, stepModule)
            invokeCommand += " %s \n" % script
            retCode = scram(invokeCommand, runtimeDir=runtimeDir)
            if retCode > 0:
                msg = "Error running command\n%s\n" % invokeCommand
                msg += "%s\n " % retCode
                msg += scram.diagnostic()
                logging.critical(msg)
                raise WMExecutionFailure(50513, "PreScriptScramFailure", msg)

        configPath = "%s/%s-main.sh" % (self.step.builder.workingDir,
                                        self.stepName)
        handle = open(configPath, 'w')
        handle.write(CONFIG_BLOB)
        handle.close()

        # spawn this new process
        # the script looks for:
        # <SCRAM_COMMAND> <SCRAM_PROJECT> <CMSSW_VERSION> <JOB_REPORT> <EXECUTABLE> <CONFIG>
        # open the output files
        stdoutHandle = open(self.step.output.stdout, 'w')
        stderrHandle = open(self.step.output.stderr, 'w')
        args = [
            '/bin/bash', configPath, scramSetup, scramArch, scramCommand,
            scramProject, cmsswVersion, jobReportXML, cmsswCommand,
            cmsswConfig, userTarball, userFiles, cmsswArguments
        ]
        logging.info("Executing CMSSW. args: %s", args)

        # possibly needed environment overrides for CMSSW call go here
        envOverride = {}
        # work around problem with GSI authentication plugin and EOS at CERN
        if socket.getfqdn().endswith("cern.ch"):
            envOverride['XRD_LOADBALANCERTTL'] = "86400"
        # some libraries linked with CMSSW need HOME in the environment
        if 'HOME' not in os.environ:
            envOverride['HOME'] = os.environ.get('PWD', "/")

        os.environ.update(envOverride)

        returncode = subprocess.call(args,
                                     stdout=stdoutHandle,
                                     stderr=stderrHandle)

        self.setCondorChirpAttrDelayed('Chirp_WMCore_cmsRun_ExitCode',
                                       returncode)
        self.setCondorChirpAttrDelayed(
            'Chirp_WMCore_%s_ExitCode' % self.stepName, returncode)

        stdoutHandle.close()
        stderrHandle.close()

        self.step.execution.exitStatus = returncode
        argsDump = {'arguments': args}

        if returncode != 0:
            msg = "Error running cmsRun\n%s\n" % argsDump
            msg += "Return code: %s\n" % returncode
            logging.critical(msg)
            raise WMExecutionFailure(returncode, "CmsRunFailure", msg)

        try:
            self.report.parse(jobReportXML, stepName=self.stepName)
        except Exception as ex:
            # Catch it if something goes wrong
            raise WMExecutionFailure(50115, "BadJobReportXML", str(ex))

        stepHelper = WMStepHelper(self.step)
        typeHelper = stepHelper.getTypeHelper()

        acquisitionEra = self.task.getAcquisitionEra()
        processingVer = self.task.getProcessingVersion()
        processingStr = self.task.getProcessingString()
        validStatus = self.workload.getValidStatus()
        inputPath = self.task.getInputDatasetPath()
        globalTag = typeHelper.getGlobalTag()
        prepID = self.task.getPrepID()
        campaign = self.workload.getCampaign()
        cacheUrl, cacheDB, configID = stepHelper.getConfigInfo()

        self.report.setValidStatus(validStatus=validStatus)
        self.report.setGlobalTag(globalTag=globalTag)
        self.report.setCampaign(campaign)
        self.report.setPrepID(prepID)
        self.report.setInputDataset(inputPath=inputPath)
        self.report.setAcquisitionProcessing(acquisitionEra=acquisitionEra,
                                             processingVer=processingVer,
                                             processingStr=processingStr)
        self.report.setConfigURL(configURL="%s;;%s;;%s" %
                                 (cacheUrl, cacheDB, configID))

        # Attach info to files
        self.report.addInfoToOutputFilesForStep(stepName=self.stepName,
                                                step=self.step)

        self.report.checkForOutputFiles(stepName=self.stepName)
        self.report.checkForAdlerChecksum(stepName=self.stepName)
        self.report.checkForRunLumiInformation(stepName=self.stepName)

        if self.step.output.keep != True:
            self.report.killOutput()
        else:
            # Check that we only keep the desired output
            for module in stepHelper.getIgnoredOutputModules():
                self.report.deleteOutputModuleForStep(stepName=self.stepName,
                                                      moduleName=module)

        # Add stageout LFN to existing TFileService files
        reportAnalysisFiles = self.report.getAnalysisFilesFromStep(
            self.stepName)
        for reportAnalysisFile in reportAnalysisFiles:
            newLFN = analysisFileLFN(reportAnalysisFile.fileName,
                                     self.step.user.lfnBase, self.job)
            addAttributesToFile(reportAnalysisFile,
                                pfn=reportAnalysisFile.fileName,
                                lfn=newLFN,
                                validate=False)

        # Add analysis file entries for additional files listed in workflow
        for fileName in stepHelper.listAnalysisFiles():
            analysisFile = stepHelper.getAnalysisFile(fileName)
            if os.path.isfile(analysisFile.fileName):
                newLFN = analysisFileLFN(analysisFile.fileName,
                                         analysisFile.lfnBase, self.job)
                self.report.addAnalysisFile(analysisFile.fileName,
                                            lfn=newLFN,
                                            Source='UserDefined',
                                            pfn=os.path.join(
                                                os.getcwd(),
                                                analysisFile.fileName),
                                            validate=False)

        return
Beispiel #17
0
    def execute(self, emulator = None, **overrides):
        """
        _execute_


        """
        # Are we using emulators again?
        if (emulator != None):
            return emulator.emulate( self.step, self.job )

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Find alternate stageout location
        self.altLFN = overrides.get('altLFN', None)

        logging.info("Beginning Steps.Executors.LogArchive.Execute")
        logging.info("Using the following overrides: %s " % overrides)
        logging.info("Step is: %s" % self.step)
        # Wait timout for stageOut
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        matchFiles = [
            ".log$",
            "FrameworkJobReport",
            "Report.pkl",
            "Report.pcl",
            "^PSet.py$",
            "^PSet.pkl$"
            ]

        #Okay, we need a stageOut Manager
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            (overrides.has_key('newStageOut') and overrides.get('newStageOut')):
            useNewStageOutCode = True
        if not useNewStageOutCode:
            # old style
            manager = StageOutMgr.StageOutMgr(**overrides)
            manager.numberOfRetries = self.step.retryCount
            manager.retryPauseTime  = self.step.retryDelay
        else:
            # new style
            logging.info("LOGARCHIVE IS USING NEW STAGEOUT CODE")
            manager = WMCore.Storage.FileManager.StageOutMgr(
                                retryPauseTime  = self.step.retryDelay,
                                numberOfRetries = self.step.retryCount,
                                **overrides)

        #Now we need to find all the reports
        logFilesForTransfer = []
        #Look in the taskSpace first
        logFilesForTransfer.extend(self.findFilesInDirectory(self.stepSpace.taskSpace.location, matchFiles))

        #What if it's empty?
        if len(logFilesForTransfer) == 0:
            msg = "Could find no log files in job"
            logging.error(msg)
            return logFilesForTransfer

        #Now that we've gone through all the steps, we have to tar it out
        tarName         = 'logArchive.tar.gz'
        tarBallLocation = os.path.join(self.stepSpace.location, tarName)
        tarBall         = tarfile.open(tarBallLocation, 'w:gz')
        for f in logFilesForTransfer:
            tarBall.add(name  = f,
                        arcname = f.replace(self.stepSpace.taskSpace.location, '', 1).lstrip('/'))
        tarBall.close()


        fileInfo = {'LFN': self.getLFN(tarName),
            'PFN' : tarBallLocation,
            'SEName' : None,
            'GUID' : None
            }

        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            manager(fileInfo)
            self.report.addOutputModule(moduleName = "logArchive")
            reportFile = {"lfn": fileInfo["LFN"], "pfn": fileInfo["PFN"],
                          "location": fileInfo["SEName"], "module_label": "logArchive",
                          "events": 0, "size": 0, "merged": False,
                          "checksums": {'md5': BasicAlgos.getMD5(tarBallLocation),
                                        'adler32': readAdler32(tarBallLocation),
                                        'cksum': readCksum(tarBallLocation)}}
            self.report.addOutputFile(outputModule = "logArchive", file = reportFile)
        except Alarm:
            msg = "Indefinite hang during stageOut of logArchive"
            logging.error(msg)
            self.report.addError(self.stepName, 60404, "LogArchiveTimeout", msg)
            self.report.persist("Report.pkl")
            raise WMExecutionFailure(60404, "LogArchiveTimeout", msg)
        except WMException, ex:
            self.report.addError(self.stepName, 60307, "LogArchiveFailure", str(ex))
            self.report.setStepStatus(self.stepName, 0)
            self.report.persist("Report.pkl")
            raise ex
Beispiel #18
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        logging.info("Steps.Executors.%s.execute called", self.__class__.__name__)

        scramCommand = self.step.application.setup.scramCommand
        cmsswVersion = self.step.application.setup.cmsswVersion
        scramArch = getSingleScramArch(self.step.application.setup.scramArch)
        overrideCatalog = getattr(self.step.application, 'overrideCatalog', None)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "xrdcp")
        eosStageOutParams['option'] = overrides.get('option', "--wma-disablewriterecovery")
        eosStageOutParams['phedex-node'] = overrides.get('phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get('lfn-prefix', "root://eoscms.cern.ch//eos/cms")

        try:
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (self.report.data.workload, taskName, host, self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # Supported by any release beyond CMSSW_8_X, however DaviX is broken until CMSSW_10_4_X
        # see: https://github.com/cms-sw/cmssw/issues/25292
        useEdmCopyUtil = True
        if isCMSSWSupported(cmsswVersion, "CMSSW_10_4_0"):
            pass
        elif scramArch.startswith('slc7_amd64_'):
            msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
            logging.warning(msg)
            cmsswVersion = "CMSSW_10_4_0"
            scramArch = "slc7_amd64_gcc820"
        elif scramArch.startswith('slc6_amd64_'):
            msg = "CMSSW too old or not fully functional to support edmCopyUtil, using CMSSW_10_4_0 instead"
            logging.warning(msg)
            cmsswVersion = "CMSSW_10_4_0"
            scramArch = "slc6_amd64_gcc700"
        else:
            useEdmCopyUtil = False
        logging.info("Using edmCopyUtil: %s", useEdmCopyUtil)

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                    command=scramCommand,
                    version=cmsswVersion,
                    initialise=self.step.application.setup.softwareEnvironment,
                    directory=self.step.builder.workingDir,
                    architecture=scramArch,
            )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get('X509_USER_PROXY', None)

            # specify TFC if necessary
            if overrideCatalog:
                copyCommand += " -c %s" % overrideCatalog

            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error("Indefinite hang during edmCopyUtil/stageIn of logArchives")
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(os.path.join(self.step.builder.workingDir, os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName="logArchives", lfn=log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            with tarfile.open(tarLocation, 'w:') as tarFile:
                for log in localLogs:
                    path = log.split('/')
                    tarFile.add(name=log,
                                arcname=os.path.join(path[-3], path[-2], path[-1]))
                    os.remove(log)
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)

        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to EOS (skipping CASTOR)")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                     self.report.data.workload,
                                                     os.path.basename(tarLocation))

        tarInfo = {'LFN': lfn,
                   'PFN': tarLocation,
                   'PNN': None,
                   'GUID': None}

        # perform mandatory stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to EOS"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule="LogCollect", aFile=tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']

        # we got this far, delete ALL input files assigned to this job
        for log in self.job["input_files"]:
            # give up after timeout of 1 minutes
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {'LFN': log['lfn'],
                                'PFN': None,
                                'PNN': None,
                                'StageOutCommand': None}
                deleteMgr(fileToDelete=fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
Beispiel #19
0
class MulticoreCMSSW(Executor):
    """
    _MulticoreCMSWW_

    Execute a CMSSW Step in a terrifying series of parallel universes!!!!1!1!

    """
    def pre(self, emulator=None):
        """
        _pre_

        Pre execution checks

        """
        if (emulator != None):
            return emulator.emulatePre(self.step)
        logging.info("Pre-executing CMSSW step")
        if hasattr(self.step.application.configuration, 'configCacheUrl'):
            # means we have a configuration & tweak in the sandbox
            psetFile = self.step.application.command.configuration
            psetTweak = self.step.application.command.psetTweak
            self.stepSpace.getFromSandbox(psetFile)

            if psetTweak:
                self.stepSpace.getFromSandbox(psetTweak)

        #Todo: Discover number of cores available to the job and set the number of
        # cores.

        self.step.runtime.scramPreScripts.append("SetupCMSSWMulticore")
        self.step.runtime.scramPreScripts.append("SetupCMSSWPset")
        return None

    def execute(self, emulator=None):
        """
        _execute_


        """
        stepModule = "WMTaskSpace.%s" % self.stepName
        if (emulator != None):
            return emulator.emulate(self.step, self.job)

        # write the wrapper script to a temporary location
        # I don't pass it directly through os.system because I don't
        # trust that there won't be shell-escape shenanigans with
        # arbitrary input files
        scramSetup = self.step.application.setup.softwareEnvironment
        scramCommand = self.step.application.setup.scramCommand
        scramProject = self.step.application.setup.scramProject
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion
        jobReportXML = self.step.output.jobReport
        cmsswCommand = self.step.application.command.executable
        cmsswConfig = self.step.application.command.configuration
        cmsswArguments = self.step.application.command.arguments

        logging.info("Executing MulticoreCMSSW step")

        multicoreSettings = self.step.application.multicore
        numberOfCores = multicoreSettings.numberOfCores
        logging.info("Multicore configured for %s cores" % numberOfCores)

        #Todo:  Create input file list from job writing filelist one per line to
        filelist = open(multicoreSettings.inputfilelist, 'w')
        for inputFile in self.job['input_files']:
            filelist.write("%s\n" % inputFile['lfn'])
        filelist.close()

        #
        # scram bootstrap
        #
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )

        logging.info("Running SCRAM")
        try:
            projectOutcome = scram.project()
        except Exception, ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(60513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            #self.report.addError(60513, "ScramSetupFailure", msg)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(60513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            #self.report.addError(60513, "ScramSetupFailure", msg)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(60513, "ScramSetupFailure", msg)

        #
        # pre scripts
        #
        logging.info("Running PRE scripts")
        for script in self.step.runtime.preScripts:
            # TODO: Exception handling and error handling & logging
            scriptProcess = subprocess.Popen(
                ["/bin/bash"],
                shell=True,
                cwd=self.step.builder.workingDir,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                stdin=subprocess.PIPE,
            )
            # BADPYTHON
            scriptProcess.stdin.write(
                "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$VO_CMS_SW_DIR/COMP/slc5_amd64_gcc434/external/openssl/0.9.7m/lib:$VO_CMS_SW_DIR/COMP/slc5_amd64_gcc434/external/bz2lib/1.0.5/lib\n"
            )
            invokeCommand = "%s -m WMCore.WMRuntime.ScriptInvoke %s %s \n" % (
                sys.executable, stepModule, script)
            logging.info("    Invoking command: %s" % invokeCommand)
            scriptProcess.stdin.write(invokeCommand)
            stdout, stderr = scriptProcess.communicate()
            retCode = scriptProcess.returncode
            if retCode > 0:
                msg = "Error running command\n%s\n" % invokeCommand
                msg += "%s\n %s\n %s\n" % (retCode, stdout, stderr)
                logging.critical("Error running command")
                logging.critical(msg)
                raise WMExecutionFailure(60514, "PreScriptFailure", msg)

        #
        # pre scripts with scram
        #
        logging.info("RUNNING SCRAM SCRIPTS")
        for script in self.step.runtime.scramPreScripts:
            "invoke scripts with scram()"
            invokeCommand = "%s -m WMCore.WMRuntime.ScriptInvoke %s %s \n" % (
                sys.executable, stepModule, script)
            logging.info("    Invoking command: %s" % invokeCommand)
            retCode = scram(invokeCommand)
            if retCode > 0:
                msg = "Error running command\n%s\n" % invokeCommand
                msg += "%s\n " % retCode
                msg += scram.diagnostic()
                logging.critical(msg)
                raise WMExecutionFailure(60515, "PreScriptScramFailure", msg)

        configPath = "%s/%s-main.sh" % (self.step.builder.workingDir,
                                        self.stepName)
        handle = open(configPath, 'w')
        handle.write(configBlob)
        handle.close()
        # spawn this new process
        # the script looks for:
        # <SCRAM_COMMAND> <SCRAM_PROJECT> <CMSSW_VERSION> <JOB_REPORT> <EXECUTABLE>
        #    <CONFIG>
        # open the output files
        stdoutHandle = open(self.step.output.stdout, 'w')
        stderrHandle = open(self.step.output.stderr, 'w')
        applicationStart = time.time()
        args = [
            '/bin/bash', configPath, scramSetup, scramArch, scramCommand,
            scramProject, cmsswVersion, jobReportXML, cmsswCommand,
            cmsswConfig, cmsswArguments
        ]
        logging.info("Executing CMSSW. args: %s" % args)
        spawnedChild = subprocess.Popen(args, 0, None, None, stdoutHandle,
                                        stderrHandle)

        #(stdoutData, stderrData) = spawnedChild.communicate()
        # the above line replaces the bottom block. I'm unsure of why
        # nobody used communicate(), but I'm leaving this just in case
        # AMM Jul 4th, /2010
        # loop and collect the data
        while True:
            (rdready, wrready, errready) = select.select(
                [stdoutHandle.fileno(),
                 stderrHandle.fileno()], [], [])
            # see if the process is still running
            spawnedChild.poll()
            if (spawnedChild.returncode != None):
                break
            # give the process some time to fill a buffer
            select.select([], [], [], .1)

        spawnedChild.wait()
        stdoutHandle.close()
        stderrHandle.close()

        self.step.execution.exitStatus = spawnedChild.returncode
        argsDump = {'arguments': args}

        if spawnedChild.returncode != 0:
            msg = "Error running cmsRun\n%s\n" % argsDump
            msg += "Return code: %s\n" % spawnedChild.returncode
            logging.critical(msg)
            raise WMExecutionFailure(spawnedChild.returncode, "CmsRunFailure",
                                     msg)

        #  //
        # // Now go through the reports and merge the outputs
        #//
        outputReports = readMultiJobReports(self.step.output.jobReport,
                                            self.stepName,
                                            self.step.builder.workingDir)
        self.report = None
        mergeBuckets = {}
        inputFiles = {}
        aggregator = ReportUtils.Aggregator()
        #  //
        # // loop over the sub reports and combine the inputs for recording and outputs for merges
        #//
        for o in outputReports:
            # use one of the reports as the template for the master report
            if self.report == None:
                self.report = o
            aggregator.add(o.report.performance)
            # store list of all unique input files into the job, adding them together if they
            # are fragments of the same file
            for inp in o.getAllInputFiles():
                lfn = inp['lfn']
                if not inputFiles.has_key(lfn):
                    inputFiles[lfn] = inp
                else:
                    addFiles(inputFiles[lfn], inp)
            #  //
            # // process the output modules from each subreport
            #//
            for f in o.getAllFiles():
                lfn = f['outputModule']
                if not mergeBuckets.has_key(lfn):
                    mergeBuckets[lfn] = MergeBucket(
                        f['lfn'], f['outputModule'], self.stepName,
                        self.step.builder.workingDir)
                mergeBuckets[lfn].append(f)

        # clean up the master report from the template
        reportData = getattr(self.report.data, self.stepName)
        for src in reportData.input.listSections_():
            delattr(reportData.input, src)
        for omod in reportData.outputModules:
            delattr(reportData.output, omod)
            reportData.outputModules.remove(omod)

        #  //
        # // Add the reduced list of input files to the master report
        #//
        for f in inputFiles.values():
            self.report.addInputFile(f['module_label'], **f)

        #  //
        # // Now roll through the merges, run the merge and edit the master job report with the outcome
        #//
        mergesStart = time.time()
        mergeTiming = []
        for b in mergeBuckets.values():
            # write the merge config file
            thisMergeStarts = time.time()
            b.writeConfig()
            # run the merge as a scram enabled command
            logging.info("    Invoking command: %s" % b.merge_command)
            logfile = "%s.log" % b.merge_pset_file
            retCode = scram(b.merge_command, False, logfile,
                            self.step.builder.workingDir)
            if retCode > 0:
                msg = "Error running merge job:\n%s\n" % b.merge_command
                msg += "Merge Config:\n%s\n" % b.mergeConfig()
                msg += "%s\n " % retCode
                msg += scram.diagnostic()
                logging.critical(msg)
                raise WMExecutionFailure(60666, "MulticoreMergeFailure", msg)
            #  //
            # // add report from merge to master
            #//  ToDo: try/except here in case report is missing
            b.editReport(self.report)
            thisMergeTime = time.time() - thisMergeStarts
            mergeTiming.append(thisMergeTime)
        mergesComplete = time.time()
        totalJobTime = mergesComplete - applicationStart
        self.report.report.performance = aggregator.aggregate()
        ReportUtils.updateMulticoreReport(self.report, len(mergeBuckets),
                                          mergesStart, mergesComplete,
                                          totalJobTime, *mergeTiming)

        return
Beispiel #20
0
    def execute(self, emulator=None):
        """
        _execute_

        """
        scramCommand = self.step.application.setup.scramCommand
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion

        # Are we using emulators again?
        if (emulator != None):
            return emulator.emulate(self.step, self.job)

        overrides = {}
        if hasattr(self.step, 'override'):
            overrides = self.step.override.dictionary_()

        # Set wait to over an hour
        waitTime = overrides.get(
            'waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

        # hardcode CERN Castor T0_CH_CERN_MSS stageout parameters
        castorStageOutParams = {}
        castorStageOutParams['command'] = overrides.get('command', "xrdcp")
        castorStageOutParams['option'] = overrides.get('option',
                                                       "--cerncastor")
        castorStageOutParams['phedex-node'] = overrides.get(
            'phedex-node', "T2_CH_CERN")
        castorStageOutParams['lfn-prefix'] = overrides.get(
            'lfn-prefix', "root://castorcms.cern.ch//castor/cern.ch/cms")

        # hardcode CERN EOS T2_CH_CERN stageout parameters
        eosStageOutParams = {}
        eosStageOutParams['command'] = overrides.get('command', "xrdcp")
        eosStageOutParams['option'] = overrides.get('option', "")
        eosStageOutParams['phedex-node'] = overrides.get(
            'phedex-node', "T2_CH_CERN")
        eosStageOutParams['lfn-prefix'] = overrides.get(
            'lfn-prefix', "root://eoscms.cern.ch//eos/cms")

        # are we using the new stageout method ?
        useNewStageOutCode = False
        if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
            useNewStageOutCode = True

        try:
            castorStageOutMgr = StageOutMgr(**castorStageOutParams)
            eosStageOutMgr = StageOutMgr(**eosStageOutParams)
            stageInMgr = StageInMgr()
            deleteMgr = DeleteMgr()
        except Exception as ex:
            msg = "Unable to load StageOut/Delete Impl: %s" % str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60312, "MgrImplementationError", msg)

        # prepare output tar file
        taskName = self.report.getTaskName().split('/')[-1]
        host = socket.gethostname().split('.')[0]
        tarName = '%s-%s-%s-%i-logs.tar' % (
            self.report.data.workload, taskName, host, self.job["counter"])
        tarLocation = os.path.join(self.stepSpace.location, tarName)

        # check if the cmsswVersion supports edmCopyUtil (min CMSSW_8_X)
        result = re.match("CMSSW_([0-9]+)_([0-9]+)_([0-9]+).*", cmsswVersion)
        useEdmCopyUtil = False
        if result:
            try:
                if int(result.group(1)) >= 8:
                    useEdmCopyUtil = True
            except ValueError:
                pass

        # setup Scram needed to run edmCopyUtil
        if useEdmCopyUtil:
            scram = Scram(
                command=scramCommand,
                version=cmsswVersion,
                initialise=self.step.application.setup.softwareEnvironment,
                directory=self.step.builder.workingDir,
                architecture=scramArch,
            )
            logging.info("Running scram")
            try:
                projectOutcome = scram.project()
            except Exception as ex:
                msg = "Exception raised while running scram.\n"
                msg += str(ex)
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

            if projectOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
            runtimeOutcome = scram.runtime()
            if runtimeOutcome > 0:
                msg = scram.diagnostic()
                logging.critical("Error running SCRAM")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        # iterate through input files
        localLogs = []
        deleteLogArchives = []
        if useEdmCopyUtil:
            numberOfFilesPerCopy = 10
        else:
            numberOfFilesPerCopy = 1
        for logs in grouper(self.job["input_files"], numberOfFilesPerCopy):

            copyCommand = "env X509_USER_PROXY=%s edmCopyUtil" % os.environ.get(
                'X509_USER_PROXY', None)
            for log in logs:
                copyCommand += " %s" % log['lfn']
            copyCommand += " %s" % self.step.builder.workingDir

            # give up after timeout of 1 minute per input file
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60 * numberOfFilesPerCopy)

            filesCopied = False
            try:
                if useEdmCopyUtil:
                    logging.info("Running edmCopyUtil")
                    retval = scram(copyCommand)
                    if retval == 0:
                        filesCopied = True
                else:
                    logging.info("Running stageIn")
                    for log in logs:
                        fileInfo = {"LFN": log['lfn']}
                        logArchive = stageInMgr(**fileInfo)
                        if logArchive:
                            filesCopied = True
            except Alarm:
                logging.error(
                    "Indefinite hang during edmCopyUtil/stageIn of logArchives"
                )
            except StageOutFailure:
                logging.error("Unable to stageIn logArchives")
            except Exception:
                raise

            signal.alarm(0)

            if filesCopied:
                for log in logs:
                    localLogs.append(
                        os.path.join(self.step.builder.workingDir,
                                     os.path.basename(log['lfn'])))
                    deleteLogArchives.append(log)
                    self.report.addInputFile(sourceName="logArchives",
                                             lfn=log['lfn'])
            else:
                logging.error("Unable to copy logArchives to local disk")
                if useEdmCopyUtil:
                    with open('scramOutput.log', 'r') as f:
                        logging.error("Scram output: %s", f.read())
                for log in logs:
                    self.report.addSkippedFile(log['lfn'], None)

        # create tarfile if any logArchive copied in
        if localLogs:
            tarFile = tarfile.open(tarLocation, 'w:')
            for log in localLogs:
                path = log.split('/')
                tarFile.add(name=log,
                            arcname=os.path.join(path[-3], path[-2], path[-1]))
                os.remove(log)
            tarFile.close()
        else:
            msg = "Unable to copy any logArchives to local disk"
            logging.error(msg)
            raise WMExecutionFailure(60312, "LogCollectError", msg)

        # now staging out the LogCollect tarfile
        logging.info("Staging out LogCollect tarfile to Castor and EOS")
        now = datetime.datetime.now()
        lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (
            now.year, now.month, "WMAgent", self.report.data.workload,
            os.path.basename(tarLocation))

        tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None}

        # perform mandatory stage out to CERN Castor
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            castorStageOutMgr(tarInfo)
        except Alarm:
            msg = "Indefinite hang during stageOut of LogCollect to Castor"
            logging.error(msg)
            raise WMExecutionFailure(60409, "LogCollectTimeout", msg)
        except Exception as ex:
            msg = "Unable to stageOut LogCollect to Castor:\n"
            msg += str(ex)
            logging.error(msg)
            raise WMExecutionFailure(60408, "LogCollectStageOutError", msg)
        signal.alarm(0)

        # add to job report
        self.report.addOutputFile(outputModule="LogCollect", file=tarInfo)
        outputRef = getattr(self.report.data, self.stepName)
        outputRef.output.pfn = tarInfo['PFN']
        outputRef.output.location = tarInfo['PNN']
        outputRef.output.lfn = tarInfo['LFN']

        tarInfo = {'LFN': lfn, 'PFN': tarLocation, 'PNN': None, 'GUID': None}

        # then, perform best effort stage out to CERN EOS
        signal.signal(signal.SIGALRM, alarmHandler)
        signal.alarm(waitTime)
        try:
            eosStageOutMgr(tarInfo)
        except Alarm:
            logging.error(
                "Indefinite hang during stageOut of LogCollect to EOS")
        except Exception as ex:
            logging.error("Unable to stageOut LogCollect to EOS:\n", ex)
        signal.alarm(0)

        # we got this far, delete input
        for log in deleteLogArchives:

            # give up after timeout of 1 minutes
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(60)
            try:
                fileToDelete = {
                    'LFN': log['lfn'],
                    'PFN': None,
                    'PNN': None,
                    'StageOutCommand': None
                }
                deleteMgr(fileToDelete=fileToDelete)
            except Alarm:
                logging.error("Indefinite hang during delete of logArchive")
            except Exception as ex:
                logging.error("Unable to delete logArchive: %s", ex)
            signal.alarm(0)

        return
Beispiel #21
0
                except StageOutFailure, ex:
                    msg = "Unable to StageIn %s" % file['LFN']
                    logging.error(msg)
                    # Don't do anything other then record it
                    self.report.addSkippedFile(file.get('PFN', None), file['LFN'])
                except Exception, ex:
                    raise

                signal.alarm(0)

            if len(readyFiles) == 0:
                # Then we have no output; all the files failed
                # Panic!
                msg = "No logs staged in during LogCollect step"
                logging.error(msg)
                raise WMExecutionFailure(60312, "LogCollectError", msg)

            now = datetime.datetime.now()
            tarPFN = self.createArchive(readyFiles)
            if userLogs:
                lfn = "%s/%s/logs/%s" % (lfnBase, self.report.data.workload, os.path.basename(tarPFN))
            else:
                lfn = "/store/logs/prod/%i/%.2i/%s/%s/%s" % (now.year, now.month, "WMAgent",
                                                            self.report.data.workload,
                                                            os.path.basename(tarPFN))

            tarInfo = {'LFN'    : lfn,
                    'PFN'    : tarPFN,
                    'SEName' : None,
                    'GUID'   : None}