コード例 #1
0
ファイル: DeferredStageout.py プロジェクト: vokac/pilot
def TransferFiles(job_state, datadir, files, **kwargs):
    """
    Transfers files from list 'files'

    May change CWD with pUtil.chdir (several times)

    :param job_state:
    :param datadir: job data dir
    :param files: list of filenames
    :param kwargs: specific arguments for other purposes
    :return:
    """
    job = job_state.job

    pUtil.chdir(datadir)

    XMLMetadata = pUtil.getMetadata(job_state.site.workdir, job.jobId)
    thisSite = DorE(kwargs, 'thisSite')

    if not setGuids(job_state, files, **kwargs):
        job.result[2] = PilotErrors().ERR_LOSTJOBPFC
        return ReturnCode.FailedJob

    outPFC = updateOutPFC(job, **kwargs)
    if not outPFC:
        return ReturnCode.FailedJob

    dsname = defaultDSname(job.destinationDblock)

    datasetDict = pUtil.getDatasetDict(job.outFiles, job.destinationDblock, job.logFile, job.logDblock)
    if not datasetDict:
        log("Output files will go to default dataset: %s" % (dsname))

    # the cmtconfig is needed by at least the xrdcp site mover
    cmtconfig = pUtil.getCmtconfig(job.cmtconfig)

    tin_0 = os.times()
    rf = None
    _state = ReturnCode.OK
    _msg = ""
    ec = -1
    try:
        # Note: alt stage-out numbers are not saved in recovery mode (job object not returned from this function)
        rc, pilotErrorDiag, rf, rs, job.filesNormalStageOut, job.filesAltStageOut, os_bucket_id = Mover.mover_put_data(
            "xmlcatalog_file:%s" % outPFC, dsname,
            thisSite.sitename, thisSite.computingElement, analysisJob=pUtil.isAnalysisJob(job.trf.split(",")[0]),
            proxycheck=DorE(kwargs, 'proxycheckFlag'),
            pinitdir=DorE(kwargs, 'pilot_initdir'),
            datasetDict=datasetDict,
            stageoutTries=DorE(kwargs, 'stageoutretry'), 
            cmtconfig=cmtconfig, recoveryWorkDir=thisSite.workdir,
            job=job)
    except Exception, e:
        pilotErrorDiag = "Put function can not be called for staging out: %s" % str(e)
        log("!!%s!!1105!! %s" % (env['errorLabel'], pilotErrorDiag))
        ec = PilotErrors().ERR_PUTFUNCNOCALL
        _state = ReturnCode.Holding
        _msg = env['errorLabel']
コード例 #2
0
ファイル: PandaServerClient.py プロジェクト: anisyonk/pilot
        # send the original xml/json if it exists (end of production job, ignore for event service job)
        filenamePayloadMetadata = self.getPayloadMetadataFilename(site.workdir, job.jobId, altloc=job.workdir)
        payloadXMLProblem = False

        # backward compatibility
        try:
            eventService = job.eventService
        except:
            eventService = False

        if not eventService:
            if os.path.exists(filenamePayloadMetadata) and final:

                # get the metadata created by the payload
                payloadXML = getMetadata(site.workdir, job.jobId, athena=True, altpath=filenamePayloadMetadata)

                # add the metadata to the node
                if payloadXML != "" and payloadXML != None:
                    tolog("Adding payload metadata of size %d to node dictionary (\'metaData\' field):\n%s" % (len(payloadXML), payloadXML))
                    node['metaData'] = payloadXML
                else:
                    pilotErrorDiag = "Empty Athena metadata in file: %s" % (filenamePayloadMetadata)
                    payloadXMLProblem = True
            else:
                # athena XML should exist at the end of the job
                if job.result[0] == 'finished' and 'Install' not in site.sitename and 'ANALY' not in site.sitename and 'DDM' not in site.sitename and 'test' not in site.sitename and job.prodSourceLabel != "install" and not eventService:
                    pilotErrorDiag = "Metadata does not exist: %s" % (filenamePayloadMetadata)
                    payloadXMLProblem = True
        else:
            tolog("Will not send payload metadata for event service job")
コード例 #3
0
ファイル: DeferredStageout.py プロジェクト: complynx/pilot
def DeferredStageoutJob(job_dir, job_state_file="", deferred_stageout_logfile=False, **kwargs):
    """
    Prepare and perform the deferred stage-out of the job stored in a directory.

    The whole procedure runs inside ``LockFileWrapper(job_dir)``. The job state
    is read and decoded, the job is prepared, its output files and log are
    transferred, the server is updated and, for final job states, the post-job
    task is run. A job that ends up "finished" is cleaned up afterwards.

    :param job_dir:     (string)    directory with a job.
                        mandatory parameter
    :param job_state_file:  (string)    path to job state file or other file containing job state. If empty, job
                                        state file is located as job_dir+'/jobState-*.*'.
                            defaults to ""

    :param deferred_stageout_logfile: (string|False)    template name for deferred log stageout
                                                        Replaces "{job_id}" with current job id like
                                                        "log-{job_id}.txt" -> "log-124124.txt"
                                        Default False

    Other parameters are passed into other functions

    :return: (bool) True when the stage-out procedure (or the post-job-only
             task) was carried out, False when the job was skipped or its
             state could not be read, decoded or prepared.
    """
    log('Deferred stageout from job directory "%s"' % job_dir)

    job_state = JobState()

    if job_state_file == "":
        try:
            job_state_file = glob(job_dir + "/" + jobState_file_wildcart)[0]
        except Exception:
            # Best effort: any failure to locate the file (typically an empty
            # glob result) means there is nothing to do here. Catching
            # Exception rather than everything lets KeyboardInterrupt and
            # SystemExit propagate.
            log("There is no job state file in the provided directory, exiting")
            return False

    log("Job state file is %s" % job_state_file)

    # Every return below leaves this "with" block, so the context manager's
    # exit handler runs on all paths.
    with LockFileWrapper(job_dir):
        if not TestJobDirForDeferredStageoutNecessity(job_dir, job_state_file, **kwargs):
            log('Job "%s" does not need deferred stageout procedure (yet)' % job_dir)
            return False

        if not job_state.get(job_state_file):
            log("Job state file reading failed, exiting")
            return False

        log('Working with job in "%s"' % job_dir)
        _job, _site, _node, _ = job_state.decode()

        if not (_job and _site and _node):
            log("Can not decode jobState file, exiting")
            return False

        with LogWrapper(deferred_stageout_logfile, _job.jobId):

            rc = PrepareJobForDeferredStageout(job_state, **kwargs)

            if rc == ReturnCode.PostJobOnly:
                pUtil.postJobTask(
                    job_state.job,
                    job_state.site,
                    DorE(kwargs, "workerNode"),
                    DorE(kwargs, "experiment"),
                    jr=True,
                    ra=job_state.recoveryAttempt,
                )
                return True

            if rc > 0:
                log("Job is not prepared for stageout, exiting")
                if rc == ReturnCode.Cleanup:
                    cleanup(job_state)
                return False

            rc, logfile, datadir, filelist = CreateTransferFileList(job_state, **kwargs)

            # Prefer the metadata XML stored in the job state node when there
            # is no data directory; otherwise (or if it is missing/empty) read
            # it via pUtil.getMetadata.
            XMLStr = ""
            if datadir == "":
                try:
                    XMLStr = job_state.node["xml"]
                except Exception:
                    # No usable stored XML; fall through to getMetadata below.
                    pass

            if XMLStr == "":
                XMLStr = pUtil.getMetadata(job_state.site.workdir, job_state.job.jobId)

            currentdir = os.getcwd()
            pUtil.chdir(job_state.site.workdir)

            try:
                if filelist:
                    log("Stageout will now transfer the files")
                    rc = TransferFiles(job_state, datadir, filelist, **kwargs)

                    if rc == ReturnCode.Holding:
                        job_state.job.result[0] = "holding"
                    if rc == ReturnCode.FailedJob:
                        job_state.job.result[0] = "failed"

                    job_state.job.setState(job_state.job.result)

                # TransferFiles may have changed the working directory.
                pUtil.chdir(job_state.site.workdir)
                ret = True
                if logfile != "" and not pUtil.isLogfileCopied(job_state.site.workdir):
                    log("Stageout will now transfer the log")
                    _log = JobLog()
                    ret, _ = _log.transferLogFile(
                        job_state.job, job_state.site, DorE(kwargs, "experiment"), dest=None, jr=True
                    )

                if not ret:
                    rc = ReturnCode.Holding  # We need to transfer log file regardless the files

                if rc == ReturnCode.OK:
                    if pUtil.verifyTransfer(job_state.site.workdir):
                        job_state.job.result[0] = "finished"
                    else:
                        job_state.job.result[0] = "failed"
                    job_state.job.setState(job_state.job.result)

                if job_state.job.result[0] in finalJobStates:
                    job_state.job.final_state = job_state.job.result[0]

                log("Stageout will now update the server with new status")

                rt, retNode = updatePandaServer(job_state, xmlstr=XMLStr, **kwargs)

                if rt == 0:
                    log("Job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]))

                    # did the server send back a command?
                    if "tobekilled" in job_state.job.action:
                        log("!!WARNING!!1120!! Panda server returned a 'tobekilled' command")
                        job_state.job.result[0] = "failed"

                    # further recovery attempt unnecessary, but keep the work dir for debugging
                    if job_state.job.result[0] == "failed":
                        log("Further recovery attempts will be prevented for failed job (will leave work dir)")
                        if not job_state.rename(job_state.site, job_state.job):
                            log("(Fate of job state file left for next pilot)")

                else:
                    log("!!WARNING!!1120!! Panda server returned a %d" % (rt))

                    # store the final state so that the next pilot will know

                    # store the metadata xml
                    retNode["xml"] = XMLStr

                    # update the job state file with the new state information
                    pUtil.updateJobState(job_state.job, job_state.site, retNode, job_state.recoveryAttempt)

                log("Stageout will now proceed to post-job actions")

                if job_state.job.result[0] in finalJobStates:
                    pUtil.postJobTask(
                        job_state.job,
                        job_state.site,
                        DorE(kwargs, "workerNode"),
                        DorE(kwargs, "experiment"),
                        jr=True,
                        ra=job_state.recoveryAttempt,
                    )
            finally:
                # Restore the caller's working directory even if one of the
                # stage-out steps above raised.
                pUtil.chdir(currentdir)

            if job_state.job.result[0] == "finished":
                log("Stageout will now remove the job, it is in finished state and can be removed")
                cleanup(job_state)

            return True
コード例 #4
0
ファイル: DeferredStageout.py プロジェクト: complynx/pilot
        if (
            (not os.path.isfile(logfile))
            and os.path.isdir(job_state.job.newDirNM)
            and os.path.isfile(os.path.join(job_state.job.newDirNM, pilotLogFileInNewWD))
        ):
            ec = pUtil.getExitCode(job_state.job.newDirNM, "pilotlog.txt")

            # Too big nesting, but I don't know how to do better.
            # We search an error code, if not found, we get it from the server
            if ec == -1:
                job_state.job.setState(["failed", 0, PilotErrors().ERR_LOSTJOBNOTFINISHED])
                log("Exit code not found")

                # get the metadata
                # this metadata does not contain the metadata for the log
                strXML = pUtil.getMetadata(job_state.site.workdir, job_state.job.jobId)

                # update the server
                rt, retNode = updatePandaServer(job_state, xmlstr=strXML, **kwargs)

                if rt == 0:
                    return lognret(
                        ReturnCode.Cleanup,
                        "Lost job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]),
                    )
                else:
                    log("!!WARNING!!1130!! Panda server returned a %d" % (rt))

                    # store the final state so that the next pilot will know
                    # store the metadata xml
                    retNode["xml"] = strXML
コード例 #5
0
    def __mk_gl_temp_dir(self):
        """Make the directory tree for glexec.

        See usage example in:
        http://wiki.nikhef.nl/grid/GLExec_TransientPilotJobs

        The mkgltempdir command is executed up to three times, sleeping 10
        seconds between attempts. An attempt succeeds when both the exit
        status and stderr are empty; the command's stdout (trailing newlines
        stripped) is then stored as the target path, exported through the
        GLEXEC_TARGET_DIR and GLEXEC_TARGET_PROXY environment variables and
        appended to sys.path.

        After the third failed attempt the job held in the Configuration
        object is marked as failed with error code 1226 and the PanDA server
        is updated with that state.

        :return: 0 on success, 1 after three failed attempts.
        """
        pUtil.tolog('sys path is %s' % sys.path)

        pUtil.tolog("folder is : %s" % self.__mkgltempdir_path)
        cmd = '%s -t 777 `pwd`' % self.__mkgltempdir_path

        max_attempts = 3
        for attempts in range(1, max_attempts + 1):
            stdout, stderr, status = execute(cmd)
            pUtil.tolog('cmd: %s' % cmd)
            pUtil.tolog('output: %s' % stdout)
            pUtil.tolog('error: %s' % stderr)
            pUtil.tolog('status: %s' % status)

            if not (status or stderr):
                self.__target_path = stdout.rstrip('\n')
                os.environ['GLEXEC_TARGET_DIR'] = self.__target_path
                os.environ['GLEXEC_TARGET_PROXY'] = os.path.join(self.__target_path, 'user_proxy')
                pUtil.tolog("gltmpdir created and added to env: %s" % self.__target_path)
                pUtil.tolog("now adding sandbox to sys.path")
                sys.path.append(self.__target_path)
                pUtil.tolog("New sys.path is %s " % sys.path)
                return 0

            pUtil.tolog('error! gltmpdir has failed')
            pUtil.tolog("mkgltempdir failed: %s" % stderr)

            if attempts < max_attempts:
                pUtil.tolog('[Trial %s] Sleeping for 10 secs and retrying' % attempts)
                time.sleep(10)
                continue

            # Last attempt failed: report the failure to the PanDA server.
            pUtil.tolog('sys path is %s' % sys.path)
            pUtil.tolog('os environ is %s' % os.environ)
            ec = 1226
            env = Configuration.Configuration()

            pUtil.tolog("Updating PanDA server for the failed job (error code %d)" % (ec))
            env['job'].result[0] = 'failed'
            env['job'].currentState = env['job'].result[0]
            env['job'].result[2] = ec
            env['pilotErrorDiag'] = "gLExec related failure - %s" % stderr
            env['job'].pilotErrorDiag = env['pilotErrorDiag']

            # NOTE(review): imported here rather than at module level,
            # presumably to avoid a circular import with pilot -- confirm.
            from pilot import getProperNodeName

            if 'https://' not in env['pshttpurl']:
                env['pshttpurl'] = 'https://' + env['pshttpurl']

            import Node
            env['workerNode'] = Node.Node()
            env['workerNode'].setNodeName(getProperNodeName(os.uname()[1]))

            env['job'].workdir = os.getcwd()
            env['thisSite'].workdir = os.getcwd()

            from PandaServerClient import PandaServerClient

            strXML = pUtil.getMetadata(env['thisSite'].workdir, env['job'].jobId)

            client = PandaServerClient(pilot_version=env['version'],
                                       pilot_version_tag=env['pilot_version_tag'],
                                       pilot_initdir=env['pilot_initdir'],
                                       jobSchedulerId=env['jobSchedulerId'],
                                       pilotId=env['pilotId'],
                                       updateServer=env['updateServerFlag'],
                                       jobrec=env['jobrec'],
                                       pshttpurl=env['pshttpurl'])

            client.updatePandaServer(env['job'], env['thisSite'], env['workerNode'], env['psport'],
                                     log=env['pilotErrorDiag'], useCoPilot=env['useCoPilot'],
                                     xmlstr=strXML)

            return 1
コード例 #6
0
        # send the original xml/json if it exists (end of production job, ignore for event service job)
        filenamePayloadMetadata = self.getPayloadMetadataFilename(site.workdir, job.jobId, altloc=job.workdir)
        payloadXMLProblem = False

        # backward compatibility
        try:
            eventService = job.eventService
        except:
            eventService = False

        if not eventService:
            if os.path.exists(filenamePayloadMetadata) and final:

                # get the metadata created by the payload
                payloadXML = getMetadata(site.workdir, job.jobId, athena=True, altpath=filenamePayloadMetadata)

                # add the metadata to the node
                if payloadXML != "" and payloadXML != None:
                    tolog("Adding payload metadata of size %d to node dictionary (\'metaData\' field):\n%s" % (len(payloadXML), payloadXML))
                    node['metaData'] = payloadXML
                else:
                    pilotErrorDiag = "Empty Athena metadata in file: %s" % (filenamePayloadMetadata)
                    payloadXMLProblem = True
            else:
                # athena XML should exist at the end of the job
                pass
                # not actual for COMPASS and others
#                if job.result[0] == 'finished' and 'Install' not in site.sitename and 'ANALY' not in site.sitename and 'DDM' not in site.sitename and 'test' not in site.sitename and job.prodSourceLabel != "install" and not eventService:
#                    pilotErrorDiag = "Metadata does not exist: %s" % (filenamePayloadMetadata)
#                    payloadXMLProblem = True
コード例 #7
0
ファイル: DeferredStageout.py プロジェクト: vokac/pilot
def DeferredStageoutJob(job_dir, job_state_file="", deferred_stageout_logfile=False,
                        **kwargs):
    """
    Prepare and perform the deferred stage-out of the job stored in a directory.

    The whole procedure runs inside ``LockFileWrapper(job_dir)``. The job state
    is read and decoded, the job is prepared, its output files and log are
    transferred, the server is updated and, for final job states, the post-job
    task is run. A job that ends up "finished" is cleaned up afterwards.

    :param job_dir:     (string)    directory with a job.
                        mandatory parameter
    :param job_state_file:  (string)    path to job state file or other file containing job state. If empty, job
                                        state file is located as job_dir+'/jobState-*.*'.
                            defaults to ""

    :param deferred_stageout_logfile: (string|False)    template name for deferred log stageout
                                                        Replaces "{job_id}" with current job id like
                                                        "log-{job_id}.txt" -> "log-124124.txt"
                                        Default False

    Other parameters are passed into other functions

    :return: (bool) True when the stage-out procedure (or the post-job-only
             task) was carried out, False when the job was skipped or its
             state could not be read, decoded or prepared.
    """
    log("Deferred stageout from job directory \"%s\"" % job_dir)

    job_state = JobState()

    if job_state_file == "":
        try:
            job_state_file = glob(job_dir + "/" + jobState_file_wildcart)[0]
        except Exception:
            # Best effort: any failure to locate the file (typically an empty
            # glob result) means there is nothing to do here. Catching
            # Exception rather than everything lets KeyboardInterrupt and
            # SystemExit propagate.
            log("There is no job state file in the provided directory, exiting")
            return False

    log("Job state file is %s" % job_state_file)

    # Every return below leaves this "with" block, so the context manager's
    # exit handler runs on all paths.
    with LockFileWrapper(job_dir):
        if not TestJobDirForDeferredStageoutNecessity(job_dir, job_state_file, **kwargs):
            log("Job \"%s\" does not need deferred stageout procedure (yet)" % job_dir)
            return False

        if not job_state.get(job_state_file):
            log("Job state file reading failed, exiting")
            return False

        log("Working with job in \"%s\"" % job_dir)
        _job, _site, _node, _ = job_state.decode()

        if not (_job and _site and _node):
            log("Can not decode jobState file, exiting")
            return False

        with LogWrapper(deferred_stageout_logfile, _job.jobId):

            rc = PrepareJobForDeferredStageout(job_state, **kwargs)

            if rc == ReturnCode.PostJobOnly:
                pUtil.postJobTask(job_state.job, job_state.site, DorE(kwargs, 'workerNode'), DorE(kwargs, 'experiment'),
                                  jr=True, ra=job_state.recoveryAttempt)
                return True

            if rc > 0:
                log("Job is not prepared for stageout, exiting")
                if rc == ReturnCode.Cleanup:
                    cleanup(job_state)
                return False

            rc, logfile, datadir, filelist = CreateTransferFileList(job_state, **kwargs)

            # Prefer the metadata XML stored in the job state node when there
            # is no data directory; otherwise (or if it is missing/empty) read
            # it via pUtil.getMetadata.
            XMLStr = ''
            if datadir == "":
                try:
                    XMLStr = job_state.node['xml']
                except Exception:
                    # No usable stored XML; fall through to getMetadata below.
                    pass

            if XMLStr == '':
                XMLStr = pUtil.getMetadata(job_state.site.workdir, job_state.job.jobId)

            currentdir = os.getcwd()
            pUtil.chdir(job_state.site.workdir)

            try:
                if filelist:
                    log("Stageout will now transfer the files")
                    rc = TransferFiles(job_state, datadir, filelist, **kwargs)

                    if rc == ReturnCode.Holding:
                        job_state.job.result[0] = "holding"
                    if rc == ReturnCode.FailedJob:
                        job_state.job.result[0] = "failed"

                    job_state.job.setState(job_state.job.result)

                # TransferFiles may have changed the working directory.
                pUtil.chdir(job_state.site.workdir)
                ret = True
                if logfile != "" and not pUtil.isLogfileCopied(job_state.site.workdir):
                    log("Stageout will now transfer the log")
                    _log = JobLog()
                    ret, _ = _log.transferLogFile(job_state.job, job_state.site, DorE(kwargs, 'experiment'),
                                                  dest=None, jr=True)

                if not ret:
                    rc = ReturnCode.Holding  # We need to transfer log file regardless the files

                if rc == ReturnCode.OK:
                    if pUtil.verifyTransfer(job_state.site.workdir):
                        job_state.job.result[0] = "finished"
                    else:
                        job_state.job.result[0] = "failed"
                    job_state.job.setState(job_state.job.result)

                if job_state.job.result[0] in finalJobStates:
                    job_state.job.final_state = job_state.job.result[0]

                log("Stageout will now update the server with new status")

                rt, retNode = updatePandaServer(job_state, xmlstr=XMLStr, **kwargs)

                if rt == 0:
                    log("Job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]))

                    # did the server send back a command?
                    if "tobekilled" in job_state.job.action:
                        log("!!WARNING!!1120!! Panda server returned a \'tobekilled\' command")
                        job_state.job.result[0] = "failed"

                    # further recovery attempt unnecessary, but keep the work dir for debugging
                    if job_state.job.result[0] == "failed":
                        log("Further recovery attempts will be prevented for failed job (will leave work dir)")
                        if not job_state.rename(job_state.site, job_state.job):
                            log("(Fate of job state file left for next pilot)")

                else:
                    log("!!WARNING!!1120!! Panda server returned a %d" % (rt))

                    # store the final state so that the next pilot will know

                    # store the metadata xml
                    retNode['xml'] = XMLStr

                    # update the job state file with the new state information
                    pUtil.updateJobState(job_state.job, job_state.site, retNode, job_state.recoveryAttempt)

                log("Stageout will now proceed to post-job actions")

                if job_state.job.result[0] in finalJobStates:
                    pUtil.postJobTask(job_state.job, job_state.site,
                                      DorE(kwargs, 'workerNode'), DorE(kwargs, 'experiment'), jr=True,
                                      ra=job_state.recoveryAttempt)
            finally:
                # Restore the caller's working directory even if one of the
                # stage-out steps above raised.
                pUtil.chdir(currentdir)

            if job_state.job.result[0] == "finished":
                log("Stageout will now remove the job, it is in finished state and can be removed")
                cleanup(job_state)

            return True
コード例 #8
0
ファイル: DeferredStageout.py プロジェクト: vokac/pilot
    ec = job_state.job.result[2]
    logfile = "%s/%s" % (job_state.site.workdir, job_state.job.logFile)
    if not ec:
        if (not os.path.isfile(logfile)) and os.path.isdir(job_state.job.newDirNM)\
                and os.path.isfile(os.path.join(job_state.job.newDirNM, pilotLogFileInNewWD)):
            ec = pUtil.getExitCode(job_state.job.newDirNM, "pilotlog.txt")

            # Too big nesting, but I don't know how to do better.
            # We search an error code, if not found, we get it from the server
            if ec == -1:
                job_state.job.setState(['failed', 0, PilotErrors().ERR_LOSTJOBNOTFINISHED])
                log("Exit code not found")

                # get the metadata
                # this metadata does not contain the metadata for the log
                strXML = pUtil.getMetadata(job_state.site.workdir, job_state.job.jobId)

                # update the server
                rt, retNode = updatePandaServer(job_state, xmlstr=strXML, **kwargs)

                if rt == 0:
                    return lognret(ReturnCode.Cleanup, "Lost job %s updated (exit code %d)" % (job_state.job.jobId,
                                                                                               job_state.job.result[2]))
                else:
                    log("!!WARNING!!1130!! Panda server returned a %d" % (rt))

                    # store the final state so that the next pilot will know
                    # store the metadata xml
                    retNode['xml'] = strXML

                    # update the job state file with the new state information
コード例 #9
0
ファイル: glexec_utils.py プロジェクト: RRCKI/pilot
    def __mk_gl_temp_dir(self):
        """Make the directory tree for glexec.

        See usage example in:
        http://wiki.nikhef.nl/grid/GLExec_TransientPilotJobs

        The mkgltempdir command is executed up to three times, sleeping 10
        seconds between attempts. An attempt succeeds when both the exit
        status and stderr are empty; the command's stdout (trailing newlines
        stripped) is then stored as the target path, exported through the
        GLEXEC_TARGET_DIR and GLEXEC_TARGET_PROXY environment variables and
        appended to sys.path.

        After the third failed attempt the job held in the Configuration
        object is marked as failed with error code 1226 and the PanDA server
        is updated with that state.

        :return: 0 on success, 1 after three failed attempts.
        """
        pUtil.tolog('sys path is %s' % sys.path)

        pUtil.tolog("folder is : %s" % self.__mkgltempdir_path)
        cmd = '%s -t 777 `pwd`' % self.__mkgltempdir_path

        max_attempts = 3
        for attempts in range(1, max_attempts + 1):
            stdout, stderr, status = execute(cmd)
            pUtil.tolog('cmd: %s' % cmd)
            pUtil.tolog('output: %s' % stdout)
            pUtil.tolog('error: %s' % stderr)
            pUtil.tolog('status: %s' % status)

            if not (status or stderr):
                self.__target_path = stdout.rstrip('\n')
                os.environ['GLEXEC_TARGET_DIR'] = self.__target_path
                os.environ['GLEXEC_TARGET_PROXY'] = os.path.join(self.__target_path, 'user_proxy')
                pUtil.tolog("gltmpdir created and added to env: %s" % self.__target_path)
                pUtil.tolog("now adding sandbox to sys.path")
                sys.path.append(self.__target_path)
                pUtil.tolog("New sys.path is %s " % sys.path)
                return 0

            pUtil.tolog('error! gltmpdir has failed')
            pUtil.tolog("mkgltempdir failed: %s" % stderr)

            if attempts < max_attempts:
                pUtil.tolog('[Trial %s] Sleeping for 10 secs and retrying' % attempts)
                time.sleep(10)
                continue

            # Last attempt failed: report the failure to the PanDA server.
            pUtil.tolog('sys path is %s' % sys.path)
            pUtil.tolog('os environ is %s' % os.environ)
            ec = 1226
            env = Configuration.Configuration()

            pUtil.tolog("Updating PanDA server for the failed job (error code %d)" % (ec))
            env['job'].result[0] = 'failed'
            env['job'].currentState = env['job'].result[0]
            env['job'].result[2] = ec
            env['pilotErrorDiag'] = "gLExec related failure - %s" % stderr
            env['job'].pilotErrorDiag = env['pilotErrorDiag']

            # NOTE(review): imported here rather than at module level,
            # presumably to avoid a circular import with pilot -- confirm.
            from pilot import getProperNodeName

            if 'https://' not in env['pshttpurl']:
                env['pshttpurl'] = 'https://' + env['pshttpurl']

            import Node
            env['workerNode'] = Node.Node()
            env['workerNode'].setNodeName(getProperNodeName(os.uname()[1]))

            env['job'].workdir = os.getcwd()
            env['thisSite'].workdir = os.getcwd()

            from PandaServerClient import PandaServerClient

            strXML = pUtil.getMetadata(env['thisSite'].workdir, env['job'].jobId)

            client = PandaServerClient(pilot_version=env['version'],
                                       pilot_version_tag=env['pilot_version_tag'],
                                       pilot_initdir=env['pilot_initdir'],
                                       jobSchedulerId=env['jobSchedulerId'],
                                       pilotId=env['pilotId'],
                                       updateServer=env['updateServerFlag'],
                                       jobrec=env['jobrec'],
                                       pshttpurl=env['pshttpurl'])

            client.updatePandaServer(env['job'], env['thisSite'], env['workerNode'], env['psport'],
                                     log=env['pilotErrorDiag'], useCoPilot=env['useCoPilot'],
                                     xmlstr=strXML)

            return 1
コード例 #10
0
    def updatePandaServer(self, job, site, workerNode, port, xmlstr=None, spaceReport=False, log=None, ra=0, jr=False, useCoPilot=False, stdout_tail="", additionalMetadata=None):
        """
        Update the job status with the jobdispatcher web server.
        State is a tuple of (jobId, ["jobstatus", transExitCode, pilotErrorCode], timestamp)
        log = log extracts
        xmlstr is set in postJobTask for finished jobs (all files). Failed jobs will only send xml for log (created in this function)
        jr = job recovery mode
        """
    
        tolog("Updating job status in updatePandaServer(): PandaId=%d, result=%s, time=%s" % (job.getState()))

        # set any holding job to failed for sites that do not use job recovery (e.g. sites with LSF, that immediately
        # removes any work directory after the LSF job finishes which of course makes job recovery impossible)
        if not self.__jobrec:
            if job.result[0] == 'holding' and site.sitename != "CERNVM":
                job.result[0] = 'failed'
                tolog("This site does not support job recovery: HOLDING state reset to FAILED")

        # note: any changed job state above will be lost for fake server updates, does it matter?

        # get the node structure expected by the server
        node = self.getNodeStructure(job, site, workerNode, spaceReport=spaceReport, log=log)

        # skip the server update (e.g. on NG)
        if not self.__updateServer:
            tolog("(fake server update)")
            return 0, node

        # get the xml
        node['xml'] = self.getXML(job, site.sitename, site.workdir, xmlstr=xmlstr, jr=jr)

        # stdout tail in case job.debug == 'true'
        if job.debug.lower() == "true" and stdout_tail != "":
            # protection for potentially large tails
            stdout_tail = stdout_tail[-2048:]
            node['stdout'] = stdout_tail
            tolog("Will send stdout tail:\n%s (length = %d)" % (stdout_tail, len(stdout_tail)))
        else:
            if job.debug.lower() != "true":
                tolog("Stdout tail will not be sent (debug=False)")
            elif stdout_tail == "":
                tolog("Stdout tail will not be sent (no stdout tail)")
            else:
                tolog("Stdout tail will not be sent (debug=%s, stdout_tail=\'%s\')" % (str(job.debug), stdout_tail))

        # PN fake lostheartbeat
        #    if job.result[0] == "finished":
        #        node['state'] = "holding"
        #        node['xml'] = ""

        # read back node['xml'] from jobState file for CERNVM
        sendXML = True
        if site.sitename == "CERNVM":
            _node = self.getNodeStructureFromFile(site.workdir, repr(job.jobId))
            if _node:
                if _node.has_key('xml'):
                    if _node['xml'] != "":
                        node['xml'] = _node['xml']
                        tolog("Read back metadata xml from job state file (length: %d)" % len(node['xml']))
                    else:
                        tolog("No metadata xml present in current job state file (1 - pilot should not send xml at this time)")
                        sendXML = False
                else:
                    tolog("No xml key in node structure")
                    sendXML = False
            else:
                tolog("No metadata xml present in current job state file (2 - pilot should not send xml at this time)")
                sendXML = False

            # change the state to holding for initial CERNVM job
            if not sendXML and (job.result[0] == "finished" or job.result[0] == "failed"):
                # only set the holding state if the Co-Pilot is used
                if useCoPilot:
                    job.result[0] = "holding"
                    node['state'] = "holding"

        # update job state file
        _retjs = updateJobState(job, site, node, recoveryAttempt=ra)

        # is it the final update?
        if job.result[0] == 'finished' or job.result[0] == 'failed' or job.result[0] == 'holding':
            final = True
        else:
            final = False

        # send the original xml if it exists (end of production job)
        filenameAthenaXML = "%s/metadata-%s.xml.ATHENA" % (site.workdir, repr(job.jobId))
        athenaXMLProblem = False
        if os.path.exists(filenameAthenaXML) and final:

            # get the metadata
            AthenaXML = getMetadata(site.workdir, job.jobId, athena=True)

            # add the metadata to the node
            if AthenaXML != "" and AthenaXML != None:
                tolog("Adding Athena metadata of size %d to node dictionary:\n%s" % (len(AthenaXML), AthenaXML))
                node['metaData'] = AthenaXML
            else:
                pilotErrorDiag = "Empty Athena metadata in file: %s" % (filenameAthenaXML)
                athenaXMLProblem = True
        else:
            # athena XML should exist at the end of the job
            if job.result[0] == 'finished' and 'Install' not in site.sitename and 'ANALY' not in site.sitename and 'DDM' not in site.sitename and 'test' not in site.sitename:
                pilotErrorDiag = "Metadata does not exist: %s" % (filenameAthenaXML)
                athenaXMLProblem = True

        # fail the job if there was a problem with the athena metadata
        # remove the comments below if a certain trf and release should be excluded from sending metadata
        # trf_exclusions = ['merge_trf.py']
        # release_exclusions = ['14.5.2.4']
        # jobAtlasRelease = getAtlasRelease(job.atlasRelease)
        # if athenaXMLProblem and job.trf.split(",")[-1] not in trf_exclusions and jobAtlasRelease[-1] not in release_exclusions:
        if athenaXMLProblem:
            tolog("!!FAILED!!1300!! %s" % (pilotErrorDiag))
            job.result[0] = "failed"
            job.result[2] = self.__error.ERR_NOATHENAMETADATA
            if node.has_key('pilotLog'):
                node['pilotLog'] += "!!FAILED!!1300!! %s" % (pilotErrorDiag)
            else:
                node['pilotLog'] = "!!FAILED!!1300!! %s" % (pilotErrorDiag)
            node['pilotErrorCode'] = job.result[2]
            node['state'] = job.result[0]

        # for backward compatibility
        try:
            experiment = job.experiment
        except:
            experiment = "unknown"

        # do not make the update if Nordugrid (leave for ARC to do)
        if readpar('region') == 'Nordugrid':
            if final:
                # update xml with SURLs stored in special SURL dictionary file
                if self.updateOutputFilesXMLWithSURLs4NG(experiment, site.workdir, job.jobId, job.outputFilesXML):
                    tolog("Successfully added SURLs to %s" % (job.outputFilesXML))

                # update xml with SURLs stored in special SURL dictionary file
                if node.has_key('xml'):
                    tolog("Updating node structure XML with SURLs")
                    node['xml'] = updateXMLWithSURLs(experiment, node['xml'], site.workdir, job.jobId, self.__jobrec) # do not use format 'NG' here
                else:
                    tolog("WARNING: Found no xml entry in the node structure")

                # store final node structure in pilot_initdir (will be sent to server by ARC control tower)
                self.copyNodeStruct4NG(node)
                tolog("Leaving the final update for the control tower")
            return 0, node

        # do not send xml if there was a put error during the log transfer
        _xml = None
        if final and node.has_key('xml'):
            # update xml with SURLs stored in special SURL dictionary file
            tolog("Updating node structure XML with SURLs")
            node['xml'] = updateXMLWithSURLs(experiment, node['xml'], site.workdir, job.jobId, self.__jobrec)

            _xml = node['xml']
            if not isLogfileCopied(site.workdir):
                tolog("Pilot will not send xml about output files since log was not transferred")
                node['xml'] = ""

        # should XML be sent at this time?
        if not sendXML:
            tolog("Metadata xml will not be sent")
            if node.has_key('xml'):
                if node['xml'] != "":
                    _xml = node['xml']
                    node['xml'] = ""

        # add experiment specific metadata
        if final and additionalMetadata != None:
            tolog("Adding additionalMetadata to node")
            if 'metaData' in node:
                node['metaData'] += additionalMetadata
            else:
                node['metaData'] = additionalMetadata

        # make the actual update, repeatedly if necessary (for the final update)
        ret = makeHTTPUpdate(job.result[0], node, port, url=self.__pshttpurl, path=self.__pilot_initdir)
        if not ret[2]: # data is None for a failed update attempt
            tolog("makeHTTPUpdate returned: %s" % str(ret))
            return 1, None

        tolog("ret = %s" % str(ret))
        data = ret[1]
        tolog("data = %s" % str(data))

        if data.has_key("command"):
            job.action = data['command']

        try:
            awk = data['StatusCode']
        except:
            tolog("!!WARNING!!1300!! Having problem updating job status, set the awk to 1 for now, and continue...")
            awk = "1"
        else:
            tolog("jobDispatcher acknowledged with %s" % (awk))

        # need to have a return code so subprocess knows if update goes ok or not
        ecode = int(awk) # use the awk code from jobdispatcher as the exit code

        # PN fake lostheartbeat
        #    if job.result[0] == "finished":
        #        ecode = 1

        # reset xml in case it was overwritten above for failed log transfers
        if final and node.has_key('xml'):
            node['xml'] = _xml

        return ecode, node # ecode=0 : update OK, otherwise something wrong