def TransferFiles(job_state, datadir, files, **kwargs):
    """ Transfers files from list 'files'
    May change CWD with pUtil.chdir (several times)

    :param job_state:
    :param datadir: job data dir
    :param files: list of filenames
    :param kwargs: specific arguments for other purposes
    :return:
    """
    job = job_state.job

    pUtil.chdir(datadir)

    XMLMetadata = pUtil.getMetadata(job_state.site.workdir, job.jobId)
    thisSite = DorE(kwargs, 'thisSite')

    if not setGuids(job_state, files, **kwargs):
        job.result[2] = PilotErrors().ERR_LOSTJOBPFC
        return ReturnCode.FailedJob

    outPFC = updateOutPFC(job, **kwargs)
    if not outPFC:
        return ReturnCode.FailedJob

    dsname = defaultDSname(job.destinationDblock)

    datasetDict = pUtil.getDatasetDict(job.outFiles, job.destinationDblock, job.logFile, job.logDblock)
    if not datasetDict:
        log("Output files will go to default dataset: %s" % (dsname))

    # the cmtconfig is needed by at least the xrdcp site mover
    cmtconfig = pUtil.getCmtconfig(job.cmtconfig)

    tin_0 = os.times()

    rf = None
    _state = ReturnCode.OK
    _msg = ""
    ec = -1
    try:
        # Note: alt stage-out numbers are not saved in recovery mode (job object not returned from this function)
        rc, pilotErrorDiag, rf, rs, job.filesNormalStageOut, job.filesAltStageOut, os_bucket_id = Mover.mover_put_data(
            "xmlcatalog_file:%s" % outPFC, dsname,
            thisSite.sitename, thisSite.computingElement,
            analysisJob=pUtil.isAnalysisJob(job.trf.split(",")[0]),
            proxycheck=DorE(kwargs, 'proxycheckFlag'),
            pinitdir=DorE(kwargs, 'pilot_initdir'),
            datasetDict=datasetDict,
            stageoutTries=DorE(kwargs, 'stageoutretry'),
            cmtconfig=cmtconfig,
            recoveryWorkDir=thisSite.workdir,
            job=job)
    except Exception, e:
        pilotErrorDiag = "Put function can not be called for staging out: %s" % str(e)
        log("!!%s!!1105!! %s" % (env['errorLabel'], pilotErrorDiag))
        ec = PilotErrors().ERR_PUTFUNCNOCALL
        _state = ReturnCode.Holding
        _msg = env['errorLabel']
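# --- Illustrative usage sketch (not part of the pilot source) ------------------------------------
# A minimal example of how TransferFiles() could be driven during job recovery. The helper name is
# hypothetical, and it assumes DorE() resolves the keyword arguments that TransferFiles() looks up
# ('thisSite', 'proxycheckFlag', 'pilot_initdir', 'stageoutretry'); adapt to the real environment.

def _example_transfer(job_state, datadir, filelist, thisSite, pilot_initdir):
    """ Hypothetical driver: stage out a recovered job's files and map the return code to a state """
    rc = TransferFiles(job_state, datadir, filelist,
                       thisSite=thisSite,            # site object providing sitename/computingElement/workdir
                       proxycheckFlag=False,         # skip the proxy check in this sketch
                       pilot_initdir=pilot_initdir,
                       stageoutretry=2)
    if rc == ReturnCode.FailedJob:
        job_state.job.result[0] = "failed"
    elif rc == ReturnCode.Holding:
        job_state.job.result[0] = "holding"
    return rc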
# send the original xml/json if it exists (end of production job, ignore for event service job)
filenamePayloadMetadata = self.getPayloadMetadataFilename(site.workdir, job.jobId, altloc=job.workdir)
payloadXMLProblem = False

# backward compatibility
try:
    eventService = job.eventService
except:
    eventService = False

if not eventService:
    if os.path.exists(filenamePayloadMetadata) and final:
        # get the metadata created by the payload
        payloadXML = getMetadata(site.workdir, job.jobId, athena=True, altpath=filenamePayloadMetadata)

        # add the metadata to the node
        if payloadXML != "" and payloadXML != None:
            tolog("Adding payload metadata of size %d to node dictionary (\'metaData\' field):\n%s" % (len(payloadXML), payloadXML))
            node['metaData'] = payloadXML
        else:
            pilotErrorDiag = "Empty Athena metadata in file: %s" % (filenamePayloadMetadata)
            payloadXMLProblem = True
    else:
        # athena XML should exist at the end of the job
        if job.result[0] == 'finished' and 'Install' not in site.sitename and 'ANALY' not in site.sitename and 'DDM' not in site.sitename and 'test' not in site.sitename and job.prodSourceLabel != "install" and not eventService:
            pilotErrorDiag = "Metadata does not exist: %s" % (filenamePayloadMetadata)
            payloadXMLProblem = True
else:
    tolog("Will not send payload metadata for event service job")
def DeferredStageoutJob(job_dir, job_state_file="", deferred_stageout_logfile=False, **kwargs):
    """ Performs staging-out preparation and stages out the job in the specified directory.

    :param job_dir: (string) directory with a job. Mandatory parameter
    :param job_state_file: (string) path to the job state file or another file containing the job state.
                            If empty, the job state file is located as job_dir+'/jobState-*.*'.
                            Defaults to ""
    :param deferred_stageout_logfile: (string|False) template name for the deferred stageout log.
                            "{job_id}" is replaced with the current job id, e.g.
                            "log-{job_id}.txt" -> "log-124124.txt".
                            Defaults to False

    Other parameters are passed into other functions.

    :return: (bool) whether stageout was performed
    """
    log('Deferred stageout from job directory "%s"' % job_dir)

    job_state = JobState()

    if job_state_file == "":
        try:
            job_state_file = glob(job_dir + "/" + jobState_file_wildcart)[0]
        except:
            log("There is no job state file in the provided directory, exiting")
            return False

    log("Job state file is %s" % job_state_file)

    # lockfd, lockfn = createAtomicLockFile(job_dir)

    with LockFileWrapper(job_dir):
        if not TestJobDirForDeferredStageoutNecessity(job_dir, job_state_file, **kwargs):
            log('Job "%s" does not need deferred stageout procedure (yet)' % job_dir)
            # releaseAtomicLockFile(lockfd, lockfn)
            return False

        if not job_state.get(job_state_file):
            log("Job state file reading failed, exiting")
            # releaseAtomicLockFile(lockfd, lockfn)
            return False

        log('Working with job in "%s"' % job_dir)
        _job, _site, _node, _recoveryAttempt = job_state.decode()

        if not (_job and _site and _node):
            log("Can not decode jobState file, exiting")
            # releaseAtomicLockFile(lockfd, lockfn)
            return False

        with LogWrapper(deferred_stageout_logfile, _job.jobId) as logger:
            rc = PrepareJobForDeferredStageout(job_state, **kwargs)

            if rc == ReturnCode.PostJobOnly:
                pUtil.postJobTask(job_state.job, job_state.site, DorE(kwargs, 'workerNode'), DorE(kwargs, 'experiment'),
                                  jr=True, ra=job_state.recoveryAttempt)
                # releaseAtomicLockFile(lockfd, lockfn)
                return True

            if rc > 0:
                log("Job is not prepared for stageout, exiting")
                if rc == ReturnCode.Cleanup:
                    cleanup(job_state)
                # releaseAtomicLockFile(lockfd, lockfn)
                return False

            rc, logfile, datadir, filelist = CreateTransferFileList(job_state, **kwargs)

            XMLStr = ''
            if datadir == "":
                try:
                    XMLStr = job_state.node['xml']
                except:
                    pass

            if XMLStr == '':
                XMLStr = pUtil.getMetadata(job_state.site.workdir, job_state.job.jobId)

            currentdir = os.getcwd()
            pUtil.chdir(job_state.site.workdir)

            if len(filelist):
                log("Stageout will now transfer the files")
                rc = TransferFiles(job_state, datadir, filelist, **kwargs)

                if rc == ReturnCode.Holding:
                    job_state.job.result[0] = "holding"
                if rc == ReturnCode.FailedJob:
                    job_state.job.result[0] = "failed"

                job_state.job.setState(job_state.job.result)

            pUtil.chdir(job_state.site.workdir)
            ret = True

            if logfile != "" and not pUtil.isLogfileCopied(job_state.site.workdir):
                log("Stageout will now transfer the log")
                _log = JobLog()
                ret, _ = _log.transferLogFile(job_state.job, job_state.site, DorE(kwargs, 'experiment'),
                                              dest=None, jr=True)

            if not ret:
                rc = ReturnCode.Holding  # the log file must be transferred regardless of the data files

            if rc == ReturnCode.OK:
                if pUtil.verifyTransfer(job_state.site.workdir):
                    job_state.job.result[0] = "finished"
                else:
                    job_state.job.result[0] = "failed"
                job_state.job.setState(job_state.job.result)

            if job_state.job.result[0] in finalJobStates:
                job_state.job.final_state = job_state.job.result[0]

            log("Stageout will now update the server with the new status")

            rt, retNode = updatePandaServer(job_state, xmlstr=XMLStr, **kwargs)

            if rt == 0:
                log("Job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]))

                # did the server send back a command?
                if "tobekilled" in job_state.job.action:
                    log("!!WARNING!!1120!! Panda server returned a 'tobekilled' command")
                    job_state.job.result[0] = "failed"

                # further recovery attempts are unnecessary, but keep the work dir for debugging
                if job_state.job.result[0] == "failed":
                    log("Further recovery attempts will be prevented for failed job (will leave work dir)")
                    if not job_state.rename(job_state.site, job_state.job):
                        log("(Fate of job state file left for next pilot)")
            else:
                log("!!WARNING!!1120!! Panda server returned a %d" % (rt))

                # store the final state so that the next pilot will know
                # store the metadata xml
                retNode['xml'] = XMLStr

                # update the job state file with the new state information
                _retjs = pUtil.updateJobState(job_state.job, job_state.site, retNode, job_state.recoveryAttempt)

            log("Stageout will now proceed to post-job actions")

            if job_state.job.result[0] in finalJobStates:
                pUtil.postJobTask(job_state.job, job_state.site, DorE(kwargs, 'workerNode'), DorE(kwargs, 'experiment'),
                                  jr=True, ra=job_state.recoveryAttempt)

            pUtil.chdir(currentdir)
            # releaseAtomicLockFile(lockfd, lockfn)

            if job_state.job.result[0] == "finished":
                log("Stageout will now remove the job, it is in finished state and can be removed")
                cleanup(job_state)

            return True
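# --- Illustrative usage sketch (not part of the pilot source) ------------------------------------
# How DeferredStageoutJob() might be invoked for every leftover job directory under a pilot work
# area. The "PandaJob_*" directory pattern and the log file template are assumptions for the
# example only; the real caller decides which directories qualify for deferred stageout.

def _example_deferred_stageout_all(pilots_workdir, **kwargs):
    """ Hypothetical driver: run deferred stageout for each leftover job directory """
    performed = 0
    for job_dir in glob(os.path.join(pilots_workdir, "PandaJob_*")):
        if DeferredStageoutJob(job_dir, deferred_stageout_logfile="log-{job_id}.txt", **kwargs):
            performed += 1
    return performed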
def __mk_gl_temp_dir(self):
    """Make the directory tree for glexec.
    See usage example in: http://wiki.nikhef.nl/grid/GLExec_TransientPilotJobs
    """
    pUtil.tolog('sys path is %s' % sys.path)
    pUtil.tolog("folder is : %s" % self.__mkgltempdir_path)
    cmd = '%s -t 777 `pwd`' % self.__mkgltempdir_path
    attempts = 0
    while attempts < 3:
        stdout, stderr, status = execute(cmd)
        pUtil.tolog('cmd: %s' % cmd)
        pUtil.tolog('output: %s' % stdout)
        pUtil.tolog('error: %s' % stderr)
        pUtil.tolog('status: %s' % status)
        if not (status or stderr):
            self.__target_path = stdout.rstrip('\n')
            os.environ['GLEXEC_TARGET_DIR'] = self.__target_path
            os.environ['GLEXEC_TARGET_PROXY'] = os.path.join(self.__target_path, 'user_proxy')
            pUtil.tolog("gltmpdir created and added to env: %s" % self.__target_path)
            pUtil.tolog("now adding sandbox to sys.path")
            sys.path.append(self.__target_path)
            pUtil.tolog("New sys.path is %s" % sys.path)
            return 0
        else:
            pUtil.tolog('error! gltmpdir has failed')
            attempts += 1
            # raise GlexecException("mkgltempdir failed: %s" % stderr)
            pUtil.tolog("mkgltempdir failed: %s" % stderr)
            if attempts == 3:
                pUtil.tolog('sys path is %s' % sys.path)
                pUtil.tolog('os environ is %s' % os.environ)
                ec = 1226
                env = Configuration.Configuration()
                pUtil.tolog("Updating PanDA server for the failed job (error code %d)" % (ec))
                env['job'].result[0] = 'failed'
                env['job'].currentState = env['job'].result[0]
                env['job'].result[2] = ec
                env['pilotErrorDiag'] = "gLExec related failure - %s" % stderr
                env['job'].pilotErrorDiag = env['pilotErrorDiag']

                from pilot import getProperNodeName
                if 'https://' not in env['pshttpurl']:
                    env['pshttpurl'] = 'https://' + env['pshttpurl']

                import Node  #, Site
                env['workerNode'] = Node.Node()
                env['workerNode'].setNodeName(getProperNodeName(os.uname()[1]))
                env['job'].workdir = os.getcwd()
                env['thisSite'].workdir = os.getcwd()

                from PandaServerClient import PandaServerClient
                strXML = pUtil.getMetadata(env['thisSite'].workdir, env['job'].jobId)
                client = PandaServerClient(pilot_version=env['version'],
                                           pilot_version_tag=env['pilot_version_tag'],
                                           pilot_initdir=env['pilot_initdir'],
                                           jobSchedulerId=env['jobSchedulerId'],
                                           pilotId=env['pilotId'],
                                           updateServer=env['updateServerFlag'],
                                           jobrec=env['jobrec'],
                                           pshttpurl=env['pshttpurl'])
                client.updatePandaServer(env['job'], env['thisSite'], env['workerNode'], env['psport'],
                                         log=env['pilotErrorDiag'], useCoPilot=env['useCoPilot'], xmlstr=strXML)
                return 1
            else:
                pUtil.tolog('[Trial %s] Sleeping for 10 secs and retrying' % attempts)
                time.sleep(10)
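# --- Illustrative sketch (not part of the pilot source) ------------------------------------------
# The retry pattern used by __mk_gl_temp_dir(), isolated as a small helper: run a command up to
# `attempts` times, treat a non-zero status or non-empty stderr as failure, and sleep between
# tries. It assumes execute() returns (stdout, stderr, status) as in the method above; the helper
# name and defaults are hypothetical.

def _run_with_retries(cmd, attempts=3, delay=10):
    """ Hypothetical helper: return stdout on success, or None after all attempts fail """
    for attempt in range(1, attempts + 1):
        stdout, stderr, status = execute(cmd)
        if not (status or stderr):
            return stdout.rstrip('\n')
        pUtil.tolog('[Trial %s] command failed (status=%s): %s' % (attempt, status, stderr))
        if attempt < attempts:
            time.sleep(delay)
    return None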
# send the original xml/json if it exists (end of production job, ignore for event service job)
filenamePayloadMetadata = self.getPayloadMetadataFilename(site.workdir, job.jobId, altloc=job.workdir)
payloadXMLProblem = False

# backward compatibility
try:
    eventService = job.eventService
except:
    eventService = False

if not eventService:
    if os.path.exists(filenamePayloadMetadata) and final:
        # get the metadata created by the payload
        payloadXML = getMetadata(site.workdir, job.jobId, athena=True, altpath=filenamePayloadMetadata)

        # add the metadata to the node
        if payloadXML != "" and payloadXML != None:
            tolog("Adding payload metadata of size %d to node dictionary (\'metaData\' field):\n%s" % (len(payloadXML), payloadXML))
            node['metaData'] = payloadXML
        else:
            pilotErrorDiag = "Empty Athena metadata in file: %s" % (filenamePayloadMetadata)
            payloadXMLProblem = True
    else:
        # athena XML should exist at the end of the job
        pass
        # not applicable for COMPASS and others
        # if job.result[0] == 'finished' and 'Install' not in site.sitename and 'ANALY' not in site.sitename and 'DDM' not in site.sitename and 'test' not in site.sitename and job.prodSourceLabel != "install" and not eventService:
        #     pilotErrorDiag = "Metadata does not exist: %s" % (filenamePayloadMetadata)
        #     payloadXMLProblem = True
ec = job_state.job.result[2]
logfile = "%s/%s" % (job_state.site.workdir, job_state.job.logFile)

if not ec:
    if (not os.path.isfile(logfile)) and os.path.isdir(job_state.job.newDirNM)\
            and os.path.isfile(os.path.join(job_state.job.newDirNM, pilotLogFileInNewWD)):
        ec = pUtil.getExitCode(job_state.job.newDirNM, "pilotlog.txt")

        # The nesting is deep here, but I don't know how to do better.
        # We search for an error code; if it is not found, we get it from the server
        if ec == -1:
            job_state.job.setState(['failed', 0, PilotErrors().ERR_LOSTJOBNOTFINISHED])
            log("Exit code not found")

            # get the metadata
            # this metadata does not contain the metadata for the log
            strXML = pUtil.getMetadata(job_state.site.workdir, job_state.job.jobId)

            # update the server
            rt, retNode = updatePandaServer(job_state, xmlstr=strXML, **kwargs)
            if rt == 0:
                return lognret(ReturnCode.Cleanup, "Lost job %s updated (exit code %d)"
                               % (job_state.job.jobId, job_state.job.result[2]))
            else:
                log("!!WARNING!!1130!! Panda server returned a %d" % (rt))

                # store the final state so that the next pilot will know
                # store the metadata xml
                retNode['xml'] = strXML

                # update the job state file with the new state information
def updatePandaServer(self, job, site, workerNode, port, xmlstr=None, spaceReport=False, log=None, ra=0, jr=False,
                      useCoPilot=False, stdout_tail="", additionalMetadata=None):
    """ Update the job status with the jobdispatcher web server.
    State is a tuple of (jobId, ["jobstatus", transExitCode, pilotErrorCode], timestamp)
    log = log extracts
    xmlstr is set in postJobTask for finished jobs (all files). Failed jobs will only send xml for log (created in this function)
    jr = job recovery mode
    """

    tolog("Updating job status in updatePandaServer(): PandaId=%d, result=%s, time=%s" % (job.getState()))

    # set any holding job to failed for sites that do not use job recovery (e.g. sites with LSF, that immediately
    # removes any work directory after the LSF job finishes which of course makes job recovery impossible)
    if not self.__jobrec:
        if job.result[0] == 'holding' and site.sitename != "CERNVM":
            job.result[0] = 'failed'
            tolog("This site does not support job recovery: HOLDING state reset to FAILED")

    # note: any changed job state above will be lost for fake server updates, does it matter?

    # get the node structure expected by the server
    node = self.getNodeStructure(job, site, workerNode, spaceReport=spaceReport, log=log)

    # skip the server update (e.g. on NG)
    if not self.__updateServer:
        tolog("(fake server update)")
        return 0, node

    # get the xml
    node['xml'] = self.getXML(job, site.sitename, site.workdir, xmlstr=xmlstr, jr=jr)

    # stdout tail in case job.debug == 'true'
    if job.debug.lower() == "true" and stdout_tail != "":
        # protection for potentially large tails
        stdout_tail = stdout_tail[-2048:]
        node['stdout'] = stdout_tail
        tolog("Will send stdout tail:\n%s (length = %d)" % (stdout_tail, len(stdout_tail)))
    else:
        if job.debug.lower() != "true":
            tolog("Stdout tail will not be sent (debug=False)")
        elif stdout_tail == "":
            tolog("Stdout tail will not be sent (no stdout tail)")
        else:
            tolog("Stdout tail will not be sent (debug=%s, stdout_tail=\'%s\')" % (str(job.debug), stdout_tail))

    # PN fake lostheartbeat
    # if job.result[0] == "finished":
    #     node['state'] = "holding"
    #     node['xml'] = ""

    # read back node['xml'] from jobState file for CERNVM
    sendXML = True
    if site.sitename == "CERNVM":
        _node = self.getNodeStructureFromFile(site.workdir, repr(job.jobId))
        if _node:
            if _node.has_key('xml'):
                if _node['xml'] != "":
                    node['xml'] = _node['xml']
                    tolog("Read back metadata xml from job state file (length: %d)" % len(node['xml']))
                else:
                    tolog("No metadata xml present in current job state file (1 - pilot should not send xml at this time)")
                    sendXML = False
            else:
                tolog("No xml key in node structure")
                sendXML = False
        else:
            tolog("No metadata xml present in current job state file (2 - pilot should not send xml at this time)")
            sendXML = False

        # change the state to holding for initial CERNVM job
        if not sendXML and (job.result[0] == "finished" or job.result[0] == "failed"):
            # only set the holding state if the Co-Pilot is used
            if useCoPilot:
                job.result[0] = "holding"
                node['state'] = "holding"

    # update job state file
    _retjs = updateJobState(job, site, node, recoveryAttempt=ra)

    # is it the final update?
    if job.result[0] == 'finished' or job.result[0] == 'failed' or job.result[0] == 'holding':
        final = True
    else:
        final = False

    # send the original xml if it exists (end of production job)
    filenameAthenaXML = "%s/metadata-%s.xml.ATHENA" % (site.workdir, repr(job.jobId))
    athenaXMLProblem = False
    if os.path.exists(filenameAthenaXML) and final:
        # get the metadata
        AthenaXML = getMetadata(site.workdir, job.jobId, athena=True)

        # add the metadata to the node
        if AthenaXML != "" and AthenaXML != None:
            tolog("Adding Athena metadata of size %d to node dictionary:\n%s" % (len(AthenaXML), AthenaXML))
            node['metaData'] = AthenaXML
        else:
            pilotErrorDiag = "Empty Athena metadata in file: %s" % (filenameAthenaXML)
            athenaXMLProblem = True
    else:
        # athena XML should exist at the end of the job
        if job.result[0] == 'finished' and 'Install' not in site.sitename and 'ANALY' not in site.sitename and 'DDM' not in site.sitename and 'test' not in site.sitename:
            pilotErrorDiag = "Metadata does not exist: %s" % (filenameAthenaXML)
            athenaXMLProblem = True

    # fail the job if there was a problem with the athena metadata
    # remove the comments below if a certain trf and release should be excluded from sending metadata
    # trf_exclusions = ['merge_trf.py']
    # release_exclusions = ['14.5.2.4']
    # jobAtlasRelease = getAtlasRelease(job.atlasRelease)
    # if athenaXMLProblem and job.trf.split(",")[-1] not in trf_exclusions and jobAtlasRelease[-1] not in release_exclusions:
    if athenaXMLProblem:
        tolog("!!FAILED!!1300!! %s" % (pilotErrorDiag))
        job.result[0] = "failed"
        job.result[2] = self.__error.ERR_NOATHENAMETADATA
        if node.has_key('pilotLog'):
            node['pilotLog'] += "!!FAILED!!1300!! %s" % (pilotErrorDiag)
        else:
            node['pilotLog'] = "!!FAILED!!1300!! %s" % (pilotErrorDiag)
        node['pilotErrorCode'] = job.result[2]
        node['state'] = job.result[0]

    # for backward compatibility
    try:
        experiment = job.experiment
    except:
        experiment = "unknown"

    # do not make the update if Nordugrid (leave for ARC to do)
    if readpar('region') == 'Nordugrid':
        if final:
            # update xml with SURLs stored in special SURL dictionary file
            if self.updateOutputFilesXMLWithSURLs4NG(experiment, site.workdir, job.jobId, job.outputFilesXML):
                tolog("Successfully added SURLs to %s" % (job.outputFilesXML))

            # update xml with SURLs stored in special SURL dictionary file
            if node.has_key('xml'):
                tolog("Updating node structure XML with SURLs")
                node['xml'] = updateXMLWithSURLs(experiment, node['xml'], site.workdir, job.jobId, self.__jobrec)  # do not use format 'NG' here
            else:
                tolog("WARNING: Found no xml entry in the node structure")

        # store final node structure in pilot_initdir (will be sent to server by ARC control tower)
        self.copyNodeStruct4NG(node)
        tolog("Leaving the final update for the control tower")
        return 0, node

    # do not send xml if there was a put error during the log transfer
    _xml = None
    if final and node.has_key('xml'):
        # update xml with SURLs stored in special SURL dictionary file
        tolog("Updating node structure XML with SURLs")
        node['xml'] = updateXMLWithSURLs(experiment, node['xml'], site.workdir, job.jobId, self.__jobrec)
        _xml = node['xml']
        if not isLogfileCopied(site.workdir):
            tolog("Pilot will not send xml about output files since log was not transferred")
            node['xml'] = ""

    # should XML be sent at this time?
    if not sendXML:
        tolog("Metadata xml will not be sent")
        if node.has_key('xml'):
            if node['xml'] != "":
                _xml = node['xml']
                node['xml'] = ""

    # add experiment specific metadata
    if final and additionalMetadata != None:
        tolog("Adding additionalMetadata to node")
        if 'metaData' in node:
            node['metaData'] += additionalMetadata
        else:
            node['metaData'] = additionalMetadata

    # make the actual update, repeatedly if necessary (for the final update)
    ret = makeHTTPUpdate(job.result[0], node, port, url=self.__pshttpurl, path=self.__pilot_initdir)
    if not ret[2]:  # data is None for a failed update attempt
        tolog("makeHTTPUpdate returned: %s" % str(ret))
        return 1, None

    tolog("ret = %s" % str(ret))
    data = ret[1]
    tolog("data = %s" % str(data))

    if data.has_key("command"):
        job.action = data['command']

    try:
        awk = data['StatusCode']
    except:
        tolog("!!WARNING!!1300!! Having problem updating job status, set the awk to 1 for now, and continue...")
        awk = "1"
    else:
        tolog("jobDispatcher acknowledged with %s" % (awk))

    # need to have a return code so subprocess knows if update goes ok or not
    ecode = int(awk)  # use the awk code from jobdispatcher as the exit code

    # PN fake lostheartbeat
    # if job.result[0] == "finished":
    #     ecode = 1

    # reset xml in case it was overwritten above for failed log transfers
    if final and node.has_key('xml'):
        node['xml'] = _xml

    return ecode, node  # ecode=0 : update OK, otherwise something wrong
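# --- Illustrative usage sketch (not part of the pilot source) ------------------------------------
# How a caller might interpret the (ecode, node) pair returned by updatePandaServer(). Only the
# return-value handling mirrors the method above; the helper name is hypothetical and `client` is
# assumed to be an already constructed PandaServerClient instance.

def _example_final_update(client, job, site, workerNode, port, xmlstr):
    """ Hypothetical caller: perform the final server update and react to the acknowledgement """
    ecode, node = client.updatePandaServer(job, site, workerNode, port, xmlstr=xmlstr)
    if ecode == 0 and node is not None:
        tolog("Final heartbeat accepted by the job dispatcher")
        return True
    tolog("!!WARNING!! Final heartbeat failed (ecode=%s), will retry later" % str(ecode))
    return False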