Code example #1
File: JobRecovery.py Project: vokac/pilot
    def isSameType(self, trf, userflag):
        """ is the lost job of same type as the current pilot? """

        # treat userflag 'self' as 'user'
        if userflag == 'self':
            userflag = 'user'

        if (isAnalysisJob(trf) and userflag == 'user') or \
               (not isAnalysisJob(trf) and userflag != 'user'):
            sametype = True
            if userflag == 'user':
                tolog(
                    "Lost job is of same type as current pilot (analysis pilot, lost analysis job trf: %s)"
                    % (trf))
            else:
                tolog(
                    "Lost job is of same type as current pilot (production pilot, lost production job trf: %s)"
                    % (trf))
        else:
            sametype = False
            if userflag == 'user':
                tolog(
                    "Lost job is not of same type as current pilot (analysis pilot, lost production job trf: %s)"
                    % (trf))
            else:
                tolog(
                    "Lost job is not of same type as current pilot (production pilot, lost analysis job trf: %s)"
                    % (trf))

        return sametype
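For context, a minimal sketch of how a recovery pass might use isSameType to skip lost jobs of the wrong flavor (the JobRecovery construction, the trf value and the follow-up logging are illustrative assumptions, not taken from the pilot source):

# Hypothetical caller: only recover lost jobs that match this pilot's type.
jr = JobRecovery()
lost_trf = "runAthena-00-00-11"   # illustrative trf name of the lost job
if jr.isSameType(lost_trf, 'user'):
    tolog("Proceeding with recovery of the lost job")
else:
    tolog("Skipping lost job of a different type")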
Code example #2
File: DeferredStageout.py Project: vokac/pilot
def TransferFiles(job_state, datadir, files, **kwargs):
    """
    Transfers files from list 'files'

    May change CWD with pUtil.chdir (several times)

    :param job_state: job state object (provides the job and site, e.g. job_state.job)
    :param datadir: job data dir
    :param files: list of filenames to transfer
    :param kwargs: specific arguments for other purposes
    :return: ReturnCode value (e.g. ReturnCode.FailedJob on error)
    """
    job = job_state.job

    pUtil.chdir(datadir)

    XMLMetadata = pUtil.getMetadata(job_state.site.workdir, job.jobId)
    thisSite = DorE(kwargs, 'thisSite')

    if not setGuids(job_state, files, **kwargs):
        job.result[2] = PilotErrors().ERR_LOSTJOBPFC
        return ReturnCode.FailedJob

    outPFC = updateOutPFC(job, **kwargs)
    if not outPFC:
        return ReturnCode.FailedJob

    dsname = defaultDSname(job.destinationDblock)

    datasetDict = pUtil.getDatasetDict(job.outFiles, job.destinationDblock, job.logFile, job.logDblock)
    if not datasetDict:
        log("Output files will go to default dataset: %s" % (dsname))

    # the cmtconfig is needed by at least the xrdcp site mover
    cmtconfig = pUtil.getCmtconfig(job.cmtconfig)

    tin_0 = os.times()
    rf = None
    _state = ReturnCode.OK
    _msg = ""
    ec = -1
    try:
        # Note: alt stage-out numbers are not saved in recovery mode (job object not returned from this function)
        rc, pilotErrorDiag, rf, rs, job.filesNormalStageOut, job.filesAltStageOut, os_bucket_id = Mover.mover_put_data(
            "xmlcatalog_file:%s" % outPFC, dsname,
            thisSite.sitename, thisSite.computingElement, analysisJob=pUtil.isAnalysisJob(job.trf.split(",")[0]),
            proxycheck=DorE(kwargs, 'proxycheckFlag'),
            pinitdir=DorE(kwargs, 'pilot_initdir'),
            datasetDict=datasetDict,
            stageoutTries=DorE(kwargs, 'stageoutretry'), 
            cmtconfig=cmtconfig, recoveryWorkDir=thisSite.workdir,
            job=job)
    except Exception, e:
        pilotErrorDiag = "Put function can not be called for staging out: %s" % str(e)
        log("!!%s!!1105!! %s" % (env['errorLabel'], pilotErrorDiag))
        ec = PilotErrors().ERR_PUTFUNCNOCALL
        _state = ReturnCode.Holding
        _msg = env['errorLabel']
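The snippet above is truncated, but the error-handling pattern is visible: failures map to PilotErrors codes and a ReturnCode value. A minimal sketch of how a caller might act on that return value (the keyword argument and the logging calls are assumptions based on the snippet):

# Hypothetical caller pattern for the ReturnCode values used above.
rc = TransferFiles(job_state, datadir, files, thisSite=thisSite)
if rc == ReturnCode.OK:
    tolog("Deferred stage-out succeeded")
elif rc == ReturnCode.Holding:
    tolog("Stage-out deferred again; job stays in holding state")
else:  # ReturnCode.FailedJob
    tolog("Deferred stage-out failed; job will be marked as failed")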
Code example #3
    def setParameters(self, *args, **kwargs):
        """ Set any internally needed variables """

        # set initial values
        self.__job = kwargs.get("job", None)
        if self.__job:
            self.__analysisJob = isAnalysisJob(self.__job.trf)
        else:
            self.__warning = "setParameters found no job object"
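For illustration, a minimal sketch of exercising this kwargs-driven setter (the Monitor owner class and the job stub are hypothetical; only the 'job' keyword comes from the snippet):

# Hypothetical usage of the kwargs lookup shown in setParameters.
class StubJob(object):
    trf = "runAthena-00-00-11"        # illustrative trf name

monitor = Monitor()                   # hypothetical owner of setParameters
monitor.setParameters(job=StubJob())  # sets __job and __analysisJob
monitor.setParameters()               # no 'job' kwarg -> warning recorded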
Code example #4
    def setParameters(self, *args, **kwargs):
        """ Set any internally needed variables """

        # set initial values
        self.__job = kwargs.get('job', None)
        if self.__job:
            self.__analysisJob = isAnalysisJob(self.__job.trf)
        else:
            self.__warning = "setParameters found no job object"
Code example #5
File: DeferredStageout.py Project: vokac/pilot
def updateOutPFC(job, **kwargs):
    file_name = "OutPutFileCatalog.xml"
    file_path = os.path.join(DorE(kwargs, 'thisSite').workdir, file_name)
    try:
        guids_status = pUtil.PFCxml(job.experiment, file_path, job.outFiles, fguids=job.outFilesGuids, fntag="pfn",
                                    analJob=pUtil.isAnalysisJob(job.trf.split(",")[0]), jr=True)
    except Exception, e:
        log("!!FAILED!!1105!! Exception caught (Could not generate xml for the remaining output files): %s" %
                    str(e))
        job.result[2] = PilotErrors().ERR_LOSTJOBXML
        return False
Code example #6
File: JobRecovery.py Project: PanDAWMS/pilot
    def isSameType(self, trf, userflag):
        """ is the lost job of same type as the current pilot? """

        # treat userflag 'self' as 'user'
        if userflag == 'self':
            userflag = 'user'

        if (isAnalysisJob(trf) and userflag == 'user') or \
               (not isAnalysisJob(trf) and userflag != 'user'):
            sametype = True
            if userflag == 'user':
                tolog("Lost job is of same type as current pilot (analysis pilot, lost analysis job trf: %s)" % (trf))
            else:
                tolog("Lost job is of same type as current pilot (production pilot, lost production job trf: %s)" % (trf))
        else:
            sametype = False
            if userflag == 'user':
                tolog("Lost job is not of same type as current pilot (analysis pilot, lost production job trf: %s)" % (trf))
            else:
                tolog("Lost job is not of same type as current pilot (production pilot, lost analysis job trf: %s)" % (trf))

        return sametype
Code example #7
File: RunJobEdison.py Project: mlassnig/pilot
                runJob.setGlobalErrorCode(error.ERR_SIGUSR1)
            else:
                runJob.setGlobalErrorCode(error.ERR_KILLSIGNAL)
            runJob.setFailureCode(runJob.getGlobalErrorCode())
            # print to stderr
            print >> sys.stderr, runJob.getGlobalPilotErrorDiag()
            raise SystemError(sig)

        signal.signal(signal.SIGTERM, sig2exc)
        signal.signal(signal.SIGQUIT, sig2exc)
        signal.signal(signal.SIGSEGV, sig2exc)
        signal.signal(signal.SIGXCPU, sig2exc)
        signal.signal(signal.SIGBUS, sig2exc)

        # see if it's an analysis job or not
        analysisJob = isAnalysisJob(job.trf.split(",")[0])
        if analysisJob:
            tolog("User analysis job")
        else:
            tolog("Production job")
        tolog("runJob received a job with prodSourceLabel=%s" % (job.prodSourceLabel))

        # setup starts here ................................................................................

        # update the job state file
        job.jobState = "setup"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # send [especially] the process group back to the pilot
        job.setState([job.jobState, 0, 0])
        rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort())
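The signal wiring above turns fatal signals into a Python exception so that cleanup code further up the stack can run. A self-contained sketch of the same pattern, with the pilot-specific error bookkeeping stripped out:

import signal
import sys

def sig2exc(sig, frame):
    # Convert the signal into an exception so except/finally blocks can clean up.
    sys.stderr.write("Caught signal %d, aborting payload\n" % sig)
    raise SystemError(sig)

for s in (signal.SIGTERM, signal.SIGQUIT, signal.SIGSEGV,
          signal.SIGXCPU, signal.SIGBUS):
    signal.signal(s, sig2exc)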
Code example #8
                runJob.setGlobalErrorCode(error.ERR_SIGUSR1)
            else:
                runJob.setGlobalErrorCode(error.ERR_KILLSIGNAL)
            runJob.setFailureCode(runJob.getGlobalErrorCode())
            # print to stderr
            print >> sys.stderr, runJob.getGlobalPilotErrorDiag()
            raise SystemError(sig)

        signal.signal(signal.SIGTERM, sig2exc)
        signal.signal(signal.SIGQUIT, sig2exc)
        signal.signal(signal.SIGSEGV, sig2exc)
        signal.signal(signal.SIGXCPU, sig2exc)
        signal.signal(signal.SIGBUS, sig2exc)

        # see if it's an analysis job or not
        analysisJob = isAnalysisJob(job.trf.split(",")[0])
        if analysisJob:
            tolog("User analysis job")
        else:
            tolog("Production job")
        tolog("runJob received a job with prodSourceLabel=%s" %
              (job.prodSourceLabel))

        # setup starts here ................................................................................

        # update the job state file
        job.jobState = "setup"
        #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test")

        # send [especially] the process group back to the pilot
        job.setState([job.jobState, 0, 0])
Code example #9
    def getJobExecutionCommand(self, job, jobSite, pilot_initdir):
        """ Define and test the command(s) that will be used to execute the payload """

        # Input tuple: (method is called from RunJob*)
        #   job: Job object
        #   jobSite: Site object
        #   pilot_initdir: launch directory of pilot.py
        #
        # Return tuple:
        #   pilot_error_code, pilot_error_diagnostics, job_execution_command, special_setup_command, JEM, cmtconfig
        # where
        #   pilot_error_code       : self.__error.<PILOT ERROR CODE as defined in PilotErrors class> (value should be 0 for successful setup)
        #   pilot_error_diagnostics: any output from problematic command or explanatory error diagnostics
        #   job_execution_command  : command to execute payload, e.g. cmd = "source <path>/setup.sh; <path>/python trf.py [options]"
        #   special_setup_command  : any special setup command that can be inserted into job_execution_command and is sent to stage-in/out methods
        #   JEM                    : Job Execution Monitor activation state (default value "NO", meaning JEM is not to be used. See JEMstub.py)
        #   cmtconfig              : cmtconfig symbol from the job def or schedconfig, e.g. "x86_64-slc5-gcc43-opt" [NOT USED IN THIS CLASS]

        pilotErrorDiag = ""
        cmd = ""
        special_setup_cmd = ""
        pysiteroot = ""
        siteroot = ""
        JEM = "NO"
        cmtconfig = ""

        # Is it an analysis job or not?
        analysisJob = isAnalysisJob(job.trf)

        # Set the INDS env variable (used by runAthena)
        if analysisJob:
            self.setINDS(job.realDatasetsIn)

        # Command used to download runAthena or runGen
        wgetCommand = "wget"

        # special setup for NG
        status, pilotErrorDiag, cmd = self.setupNordugridTrf(job, analysisJob, wgetCommand, pilot_initdir)
        if status != 0:
            return status, pilotErrorDiag, "", special_setup_cmd, JEM, cmtconfig

        # add FRONTIER debugging and RUCIO env variables
        cmd = self.addEnvVars2Cmd(cmd, job.jobId, job.processingType, jobSite.sitename, analysisJob)

        if readpar("cloud") == "DE":
            # Should JEM be used?
            metaOut = {}
            try:
                import sys
                from JEMstub import updateRunCommand4JEM

                # If JEM should be used, the command will get updated by the JEMstub automatically.
                cmd = updateRunCommand4JEM(cmd, job, jobSite, tolog, metaOut=metaOut)
            except:
                # On failure, cmd stays the same
                tolog("Failed to update run command for JEM - will run unmonitored.")

            # Is JEM to be used?
            if metaOut.has_key("JEMactive"):
                JEM = metaOut["JEMactive"]

            tolog("Use JEM: %s (dictionary = %s)" % (JEM, str(metaOut)))

        elif "--enable-jem" in cmd:
            tolog("!!WARNING!!1111!! JEM can currently only be used on certain sites in DE")

        # Pipe stdout/err for payload to files
        cmd += " 1>%s 2>%s" % (job.stdout, job.stderr)
        tolog("\nCommand to run the job is: \n%s" % (cmd))

        tolog("ATLAS_PYTHON_PILOT = %s" % (os.environ["ATLAS_PYTHON_PILOT"]))

        if special_setup_cmd != "":
            tolog("Special setup command: %s" % (special_setup_cmd))

        return 0, pilotErrorDiag, cmd, special_setup_cmd, JEM, cmtconfig
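A minimal sketch of how a RunJob* caller might consume the returned tuple (everything except the tuple layout is an assumption; 'experiment' stands in for whatever object provides getJobExecutionCommand, and job/jobSite/pilot_initdir are assumed to exist in the caller's scope):

# Hypothetical consumer of the getJobExecutionCommand() return tuple.
ec, pilotErrorDiag, cmd, special_setup_cmd, JEM, cmtconfig = \
    experiment.getJobExecutionCommand(job, jobSite, pilot_initdir)
if ec != 0:
    tolog("!!WARNING!! Payload setup failed: %s" % pilotErrorDiag)
else:
    # cmd already pipes stdout/stderr to job.stdout/job.stderr (see above)
    exit_code = os.system(cmd)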
Code example #10
    def getJobExecutionCommandObsolete(self, job, jobSite, pilot_initdir):
        """ Define and test the command(s) that will be used to execute the payload """

        # Input tuple: (method is called from RunJob*)
        #   job: Job object
        #   jobSite: Site object
        #   pilot_initdir: launch directory of pilot.py
        #
        # Return tuple:
        #   pilot_error_code, pilot_error_diagnostics, job_execution_command, special_setup_command, JEM, cmtconfig
        # where
        #   pilot_error_code       : self.__error.<PILOT ERROR CODE as defined in PilotErrors class> (value should be 0 for successful setup)
        #   pilot_error_diagnostics: any output from problematic command or explanatory error diagnostics
        #   job_execution_command  : command to execute payload, e.g. cmd = "source <path>/setup.sh; <path>/python trf.py [options]"
        #   special_setup_command  : any special setup command that can be inserted into job_execution_command and is sent to stage-in/out methods
        #   JEM                    : Job Execution Monitor activation state (default value "NO", meaning JEM is not to be used. See JEMstub.py)
        #   cmtconfig              : cmtconfig symbol from the job def or schedconfig, e.g. "x86_64-slc5-gcc43-opt" [NOT USED IN THIS CLASS]

        pilotErrorDiag = ""
        cmd = ""
        special_setup_cmd = ""
        pysiteroot = ""
        siteroot = ""
        JEM = "NO"
        cmtconfig = ""

        # Is it an analysis job or not?
        analysisJob = isAnalysisJob(job.trf)

        # Set the INDS env variable (used by runAthena)
        if analysisJob:
            self.setINDS(job.realDatasetsIn)

        # Command used to download runAthena or runGen
        wgetCommand = 'wget'

        # special setup for NG
        status, pilotErrorDiag, cmd = self.setupNordugridTrf(
            job, analysisJob, wgetCommand, pilot_initdir)
        if status != 0:
            return status, pilotErrorDiag, "", special_setup_cmd, JEM, cmtconfig

        # add FRONTIER debugging and RUCIO env variables
        cmd = self.addEnvVars2Cmd(cmd, job.jobId, job.taskID,
                                  job.processingType, jobSite.sitename,
                                  analysisJob)

        if readpar('cloud') == "DE":
            # Should JEM be used?
            metaOut = {}
            try:
                import sys
                from JEMstub import updateRunCommand4JEM
                # If JEM should be used, the command will get updated by the JEMstub automatically.
                cmd = updateRunCommand4JEM(cmd,
                                           job,
                                           jobSite,
                                           tolog,
                                           metaOut=metaOut)
            except:
                # On failure, cmd stays the same
                tolog(
                    "Failed to update run command for JEM - will run unmonitored."
                )

            # Is JEM to be used?
            if metaOut.has_key("JEMactive"):
                JEM = metaOut["JEMactive"]

            tolog("Use JEM: %s (dictionary = %s)" % (JEM, str(metaOut)))

        elif '--enable-jem' in cmd:
            tolog(
                "!!WARNING!!1111!! JEM can currently only be used on certain sites in DE"
            )

        # Pipe stdout/err for payload to files
        cmd += " 1>%s 2>%s" % (job.stdout, job.stderr)
        tolog("\nCommand to run the job is: \n%s" % (cmd))

        tolog("ATLAS_PYTHON_PILOT = %s" % (os.environ['ATLAS_PYTHON_PILOT']))

        if special_setup_cmd != "":
            tolog("Special setup command: %s" % (special_setup_cmd))

        return 0, pilotErrorDiag, cmd, special_setup_cmd, JEM, cmtconfig
Code example #11
class RunJobHpcEvent(RunJob):

    # private data members
    __runjob = "RunJobHpcEvent"  # String defining the sub class
    __instance = None  # Boolean used by subclasses to become a Singleton

    #__error = PilotErrors()                     # PilotErrors object

    # Required methods

    def __init__(self):
        """ Default initialization """

        # e.g. self.__errorLabel = errorLabel
        self.__output_es_files = []
        self.__eventRanges = {}
        self.__failedStageOuts = []
        self._hpcManager = None

    def __new__(cls, *args, **kwargs):
        """ Override the __new__ method to make the class a singleton """

        if not cls.__instance:
            cls.__instance = super(RunJob, cls).__new__(cls, *args, **kwargs)

        return cls.__instance

    def getRunJob(self):
        """ Return a string with the experiment name """

        return self.__runjob

    def getRunJobFileName(self):
        """ Return the filename of the module """

        return super(RunJobHpcEvent, self).getRunJobFileName()

    # def argumentParser(self):  <-- see example in RunJob.py

    def allowLoopingJobKiller(self):
        """ Should the pilot search for looping jobs? """

        # The pilot has the ability to monitor the payload work directory. If there are no updated files within a certain
        # time limit, the pilot will consider the payload stuck (looping) and will kill it. The looping time limits are set
        # in environment.py (see e.g. loopingLimitDefaultProd)

        return False

    def setupHPCEvent(self):
        self.__jobSite = Site.Site()
        self.__jobSite.setSiteInfo(self.argumentParser())
        ## For an HPC job the workdir would not strictly need to be reassigned
        # reassign workdir for this job anyway
        self.__jobSite.workdir = self.__jobSite.wntmpdir
        if not os.path.exists(self.__jobSite.workdir):
            os.makedirs(self.__jobSite.workdir)

        tolog("runJobHPCEvent.getPilotLogFilename=%s" %
              self.getPilotLogFilename())
        if self.getPilotLogFilename() != "":
            pUtil.setPilotlogFilename(self.getPilotLogFilename())

        # set node info
        self.__node = Node.Node()
        self.__node.setNodeName(os.uname()[1])
        self.__node.collectWNInfo(self.__jobSite.workdir)

        # redirect stderr
        #sys.stderr = open("%s/runJobHPCEvent.stderr" % (self.__jobSite.workdir), "w")

        tolog("Current job workdir is: %s" % os.getcwd())
        tolog("Site workdir is: %s" % self.__jobSite.workdir)

        # get the experiment object
        self.__thisExperiment = getExperiment(self.getExperiment())
        tolog("runEvent will serve experiment: %s" %
              (self.__thisExperiment.getExperiment()))

    def getHPCEventJobFromPanda(self):
        pass

    def getHPCEventJobFromEnv(self):
        tolog("getHPCEventJobFromEnv")
        try:
            # always use this filename as the new jobDef module name
            import newJobDef
            job = Job.Job()
            job.setJobDef(newJobDef.job)
            job.coreCount = 0
            job.workdir = self.__jobSite.workdir
            job.experiment = self.getExperiment()
            # figure out and set payload file names
            job.setPayloadName(self.__thisExperiment.getPayloadName(job))
            # reset the default job output file list which is anyway not correct
            job.outFiles = []
        except Exception, e:
            pilotErrorDiag = "Failed to process job info: %s" % str(e)
            tolog("!!WARNING!!3000!! %s" % (pilotErrorDiag))
            self.failJob(0,
                         PilotErrors.ERR_UNKNOWN,
                         job,
                         pilotErrorDiag=pilotErrorDiag)

        self.__job = job
        # prepare for the output file data directory
        # (will only be created for jobs that end up in a 'holding' state)
        self.__job.datadir = self.getParentWorkDir() + "/PandaJob_%s_data" % (
            job.jobId)

        # See if it's an analysis job or not
        trf = self.__job.trf
        self.__analysisJob = isAnalysisJob(trf.split(",")[0])

        # Setup starts here ................................................................................

        # Update the job state file
        self.__job.jobState = "starting"
        self.__job.setHpcStatus('init')

        # Send [especially] the process group back to the pilot
        self.__job.setState([self.__job.jobState, 0, 0])
        self.__job.jobState = self.__job.result
        rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(),
                                               self.getPilotPort())

        self.__JR = JobRecovery(pshttpurl='https://pandaserver.cern.ch',
                                pilot_initdir=self.__job.workdir)
        self.__JR.updateJobStateTest(self.__job,
                                     self.__jobSite,
                                     self.__node,
                                     mode="test")
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node,
                                    25443)

        # prepare the setup and get the run command list
        ec, runCommandList, job, multi_trf = self.setup(
            self.__job, self.__jobSite, self.__thisExperiment)
        if ec != 0:
            tolog("!!WARNING!!2999!! runJob setup failed: %s" %
                  (job.pilotErrorDiag))
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
        tolog("Setup has finished successfully")
        self.__job = job
        self.__runCommandList = runCommandList
        self.__multi_trf = multi_trf

        # job has been updated, display it again
        self.__job.displayJob()
        tolog("RunCommandList: %s" % self.__runCommandList)
        tolog("Multi_trf: %s" % self.__multi_trf)