Exemple #1
0
    def updateJobStateTest(self,
                           job,
                           thisSite,
                           workerNode,
                           recoveryAttempt=0,
                           mode=""):
        """
        Write the current job state to the job state file.

        NOTE: this function will eventually replace pilot::updateJobState() when
        the new job recovery is in place.

        :param job: job object; its jobState attribute is reported in the log.
        :param thisSite: site object handed to JobState.put().
        :param workerNode: worker node object handed to JobState.put().
        :param recoveryAttempt: recovery attempt number (default 0); values above
            zero are included in the success log message.
        :param mode: accepted for interface compatibility only - the test mode is
            ignored and an empty mode is always handed to JobState.put().
        :return: True if JobState.put() reported success, False otherwise. On
            failure the error text is stored in self.__pilotErrorDiag and logged.
        """

        # IGNORE TEST MODE
        mode = ""

        # create a job state object and give it the current job state information
        JS = JobState()
        if not JS.put(job,
                      thisSite,
                      workerNode,
                      recoveryAttempt=recoveryAttempt,
                      mode=mode):
            self.__pilotErrorDiag = "Failed to update job state file"
            tolog(self.__errorString % self.__pilotErrorDiag)
            return False

        if recoveryAttempt > 0:
            tolog(
                "Successfully updated job state file (recovery attempt number: %d) with state: %s"
                % (recoveryAttempt, job.jobState))
        else:
            tolog("Successfully updated job state file with state: %s" %
                  (job.jobState))

        # A successful write is a success whatever the recovery attempt number is;
        # previously True was only returned when recoveryAttempt was 0.
        return True
Exemple #2
0
    def updateJobStateTest(self, job, thisSite, workerNode, recoveryAttempt=0, mode=""):
        """
        Write the current job state to the job state file.

        NOTE: this function will eventually replace pilot::updateJobState() when
        the new job recovery is in place.

        :param job: job object; its jobState attribute is reported in the log.
        :param thisSite: site object handed to JobState.put().
        :param workerNode: worker node object handed to JobState.put().
        :param recoveryAttempt: recovery attempt number (default 0); values above
            zero are included in the success log message.
        :param mode: mode string forwarded unchanged to JobState.put().
        :return: True if JobState.put() reported success, False otherwise. On
            failure the error text is stored in self.__pilotErrorDiag and logged.
        """

        # create a job state object and give it the current job state information
        JS = JobState()
        if not JS.put(job, thisSite, workerNode, recoveryAttempt=recoveryAttempt, mode=mode):
            self.__pilotErrorDiag = "Failed to update job state file"
            tolog(self.__errorString % self.__pilotErrorDiag)
            return False

        if recoveryAttempt > 0:
            tolog("Successfully updated job state file (recovery attempt number: %d) with state: %s" % (recoveryAttempt, job.jobState))
        else:
            tolog("Successfully updated job state file with state: %s" % (job.jobState))

        # A successful write is a success whatever the recovery attempt number is;
        # previously True was only returned when recoveryAttempt was 0.
        return True
Exemple #3
0
    def getNodeStructureFromFile(self, workDir, jobId):
        """
        Read the node structure out of the job state file.

        :param workDir: work directory used to build the job state file name.
        :param jobId: job id used to build the job state file name.
        :return: the node object decoded from the job state file, or None when
            the file does not exist or could not be loaded.
        """

        state = JobState()

        # log the inputs and the resulting job state file name
        tolog("workDir: %s" % (workDir))
        tolog("jobId: %s" % (jobId))
        path = state.getFilename(workDir, jobId)
        tolog("filename: %s" % (path))

        if not os.path.exists(path):
            tolog("%s does not exist" % (path))
            return None

        # load the objects
        if not state.get(path):
            tolog("JS.decode() failed to load objects")
            return None

        # decode the job state info; only the node part is of interest here
        _, _, node, _ = state.decode()
        return node
    def getNodeStructureFromFile(self, workDir, jobId):
        """
        Read the node structure out of the job state file.

        :param workDir: work directory used to build the job state file name.
        :param jobId: job id used to build the job state file name.
        :return: the node object decoded from the job state file, or None when
            the file does not exist or could not be loaded.
        """

        job_state = JobState()

        # log the inputs and the resulting job state file name
        tolog("workDir: %s" % (workDir))
        tolog("jobId: %s" % (jobId))
        state_file = job_state.getFilename(workDir, jobId)
        tolog("filename: %s" % (state_file))

        if not os.path.exists(state_file):
            tolog("%s does not exist" % (state_file))
            return None

        # load the objects
        if not job_state.get(state_file):
            tolog("JS.decode() failed to load objects")
            return None

        # decode the job state info; only the node part is of interest here
        _, _, node, _ = job_state.decode()
        return node
Exemple #5
0
    def performRecovery(self):
        """
        Run job recovery in all recovery directories.

        :return: False if initialization failed earlier (the reason is stored in
            self.__pilotErrorDiag and logged) or if there are no recovery
            directories; otherwise the status returned by self.run() for the
            last directory that was scanned.
        """

        status = False

        if not self.__PASSED_INIT:
            self.__pilotErrorDiag = "Aborting job recovery due to previous failure"
            tolog(self.__errorString % self.__pilotErrorDiag)
            return status

        # keep track of the starting dir
        startingDir = os.getcwd()

        # scan all recovery dirs
        JS = JobState()
        try:
            for _dir in self.__recoveryDirs:
                # NOTE(review): only the status of the last directory survives the
                # loop - confirm whether an aggregated status was intended
                status = self.run(_dir, JS)
        finally:
            # return to the starting dir, also when a recovery run raised
            chdir(startingDir)

        return status
Exemple #6
0
def DeferredStageoutJob(job_dir, job_state_file="", deferred_stageout_logfile=False, **kwargs):
    """
    Performs staging out preparation and stages out the job in specified directory.

    :param job_dir:     (string)    directory with a job.
                        mandatory parameter
    :param job_state_file:  (string)    path to job state file or other file containing job state. If empty, job
                                        state file is located as job_dir+'/jobState-*.*'.
                            defaults to ""

    :param deferred_stageout_logfile: (string|False)    template name for deferred log stageout
                                                        Replaces "{job_id}" with current job id like
                                                        "log-{job_id}.txt" -> "log-124124.txt"
                                        Default False

    Other parameters are passed into other functions

    :return: (bool) the fact of stageout being performed: True when only the post-job task was needed or when
             the stage-out sequence ran through to the end; False when no job state file was found, the job
             does not need deferred stageout, the job state could not be read/decoded, or the job could not
             be prepared for stageout.
    """
    log('Deferred stageout from job directory "%s"' % job_dir)

    job_state = JobState()

    if job_state_file == "":
        try:
            job_state_file = glob(job_dir + "/" + jobState_file_wildcart)[0]
        except Exception:
            # typically an IndexError because the wildcard matched nothing
            log("There is no job state file in the provided directory, exiting")
            return False

    log("Job state file is %s" % job_state_file)

    # the lock is held for the whole procedure and released by the context manager on every exit path
    with LockFileWrapper(job_dir):
        if not TestJobDirForDeferredStageoutNecessity(job_dir, job_state_file, **kwargs):
            log('Job "%s" does not need deferred stageout procedure (yet)' % job_dir)
            return False

        if not job_state.get(job_state_file):
            log("Job state file reading failed, exiting")
            return False

        log('Working with job in "%s"' % job_dir)
        _job, _site, _node, _recoveryAttempt = job_state.decode()

        if not (_job and _site and _node):
            log("Can not decode jobState file, exiting")
            return False

        with LogWrapper(deferred_stageout_logfile, _job.jobId):

            rc = PrepareJobForDeferredStageout(job_state, **kwargs)

            if rc == ReturnCode.PostJobOnly:
                pUtil.postJobTask(
                    job_state.job,
                    job_state.site,
                    DorE(kwargs, "workerNode"),
                    DorE(kwargs, "experiment"),
                    jr=True,
                    ra=job_state.recoveryAttempt,
                )
                return True

            if rc > 0:
                log("Job is not prepared for stageout, exiting")
                if rc == ReturnCode.Cleanup:
                    cleanup(job_state)
                return False

            rc, logfile, datadir, filelist = CreateTransferFileList(job_state, **kwargs)

            # prefer the metadata stored in the node structure, fall back to getMetadata()
            XMLStr = ""
            if datadir == "":
                try:
                    XMLStr = job_state.node["xml"]
                except Exception:
                    # no stored metadata available; handled by the fallback below
                    pass

            if XMLStr == "":
                XMLStr = pUtil.getMetadata(job_state.site.workdir, job_state.job.jobId)

            currentdir = os.getcwd()
            pUtil.chdir(job_state.site.workdir)
            try:
                if len(filelist):
                    log("Stageout will now transfer the files")
                    rc = TransferFiles(job_state, datadir, filelist, **kwargs)

                    if rc == ReturnCode.Holding:
                        job_state.job.result[0] = "holding"
                    if rc == ReturnCode.FailedJob:
                        job_state.job.result[0] = "failed"

                    job_state.job.setState(job_state.job.result)

                # make sure we are (still) in the work dir before handling the log
                pUtil.chdir(job_state.site.workdir)
                ret = True
                if logfile != "" and not pUtil.isLogfileCopied(job_state.site.workdir):
                    log("Stageout will now transfer the log")
                    _log = JobLog()
                    ret, _ = _log.transferLogFile(
                        job_state.job, job_state.site, DorE(kwargs, "experiment"), dest=None, jr=True
                    )

                if not ret:
                    rc = ReturnCode.Holding  # We need to transfer log file regardless the files

                if rc == ReturnCode.OK:
                    if pUtil.verifyTransfer(job_state.site.workdir):
                        job_state.job.result[0] = "finished"
                    else:
                        job_state.job.result[0] = "failed"
                    job_state.job.setState(job_state.job.result)

                if job_state.job.result[0] in finalJobStates:
                    job_state.job.final_state = job_state.job.result[0]

                log("Stageout will now update the server with new status")

                rt, retNode = updatePandaServer(job_state, xmlstr=XMLStr, **kwargs)

                if rt == 0:
                    log("Job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]))

                    # did the server send back a command?
                    if "tobekilled" in job_state.job.action:
                        log("!!WARNING!!1120!! Panda server returned a 'tobekilled' command")
                        job_state.job.result[0] = "failed"

                    # further recovery attempt unnecessary, but keep the work dir for debugging
                    if job_state.job.result[0] == "failed":
                        log("Further recovery attempts will be prevented for failed job (will leave work dir)")
                        if not job_state.rename(job_state.site, job_state.job):
                            log("(Fate of job state file left for next pilot)")

                else:
                    log("!!WARNING!!1120!! Panda server returned a %d" % (rt))

                    # store the final state so that the next pilot will know

                    # store the metadata xml
                    retNode["xml"] = XMLStr

                    # update the job state file with the new state information
                    pUtil.updateJobState(job_state.job, job_state.site, retNode, job_state.recoveryAttempt)

                log("Stageout will now proceed to post-job actions")

                if job_state.job.result[0] in finalJobStates:
                    pUtil.postJobTask(
                        job_state.job,
                        job_state.site,
                        DorE(kwargs, "workerNode"),
                        DorE(kwargs, "experiment"),
                        jr=True,
                        ra=job_state.recoveryAttempt,
                    )
            finally:
                # always leave the work dir again, also when a stage-out step raised
                pUtil.chdir(currentdir)

            if job_state.job.result[0] == "finished":
                log("Stageout will now remove the job, it is in finished state and can be removed")
                cleanup(job_state)

            return True
Exemple #7
0
def DeferredStageoutHPCJob(job_dir, deferred_stageout_logfile=False, **kwargs):
    """
    Performs staging out preparation for the HPC job in specified directory.

    :param job_dir:     (string)    directory with a job.
                        mandatory parameter

    :param deferred_stageout_logfile: (string|False)    template name for deferred log stageout
                                                        Replaces "{job_id}" with current job id like
                                                        "log-{job_id}.txt" -> "log-124124.txt"
                                        Default False

    Other parameters are passed into other functions

    :return: (bool) the fact of stageout being performed: True when the recovery monitor was run; False when
             the lock could not be taken, the job state could not be read, the server reports that the job
             should not be recovered, or any exception was raised on the way.
    """
    log('Deferred stageout from HPC job directory "%s"' % job_dir)

    file_path = job_dir + "/" + hpc_jobState_file_wildcart
    current_dir = os.getcwd()
    log("Working on %s" % file_path)
    log("Chdir from current dir %s to %s" % (current_dir, job_dir))
    pUtil.chdir(job_dir)
    # remembers whether the original working directory still has to be restored
    in_job_dir = True

    try:
        with LockFileWrapper(file_path) as is_locked:
            if not is_locked:
                return False

            from json import load

            with open(file_path) as data_file:
                HPC_state = load(data_file)
            job_state_file = HPC_state["JobStateFile"]
            job_command = HPC_state["JobCommand"]
            JS = JobState()
            if not JS.get(job_state_file):
                log("Job state file reading failed, exiting")
                return False
            _job, _site, _node, _recoveryAttempt = JS.decode()

            if not (_job and _site and _node):
                log("Can not decode jobState file, exiting")
                return False

            with LogWrapper(deferred_stageout_logfile, _job.jobId):
                jobStatus, jobAttemptNr, jobStatusCode = pUtil.getJobStatus(
                    _job.jobId, DorE(kwargs, "pshttpurl"), DorE(kwargs, "psport"), DorE(kwargs, "pilot_initdir")
                )
                # recover this job?
                if jobStatusCode == 20:
                    log("Received general error code from dispatcher call (leave job for later pilot)")
                    return False
                elif jobStatus in finalJobStates or "tobekilled" in _job.action:
                    log(
                        "Job %s is currently in state '%s' with attemptNr = %d (according to server - will not"
                        " perform staging out)" % (_job.jobId, jobStatus, jobAttemptNr)
                    )
                    return False

                # update job state file at this point to prevent a parallel pilot from doing a simultaneous recovery
                pUtil.updateJobState(_job, _site, _node, _recoveryAttempt)

        # the lock has been released at this point; the monitor takes over the recovery
        monitor = Monitor(env)
        monitor.monitor_recovery_job(_job, _site, _node, job_command, job_state_file, recover_dir=job_dir)

        log("Chdir back to %s" % current_dir)
        pUtil.chdir(current_dir)
        in_job_dir = False

        panda_jobs = glob(job_dir + "/PandaJob_*_*")
        panda_logs = glob(job_dir + "/*.log.tgz.*")
        if panda_jobs or panda_logs:
            log(
                "Number of founded panda jobs: %d, number of panda log tar file %d, will not remove job dir"
                % (len(panda_jobs), len(panda_logs))
            )
        else:
            log(
                "Number of founded panda jobs: %d, number of panda log tar file %d, will remove job dir"
                % (len(panda_jobs), len(panda_logs))
            )
            log("Remove job dir %s" % job_dir)
            # NOTE(review): job_dir is interpolated unquoted into a shell command
            os.system("rm -rf %s" % job_dir)
        return True
    except:
        # NOTE(review): the bare except also traps SystemExit/KeyboardInterrupt; kept as is so that
        # the existing "log and report False" behaviour does not change
        log("Failed to start deferred stage out for HPC job: %s" % traceback.format_exc())
        return False
    finally:
        # early returns and failures above would otherwise leave the process inside job_dir
        if in_job_dir:
            pUtil.chdir(current_dir)
Exemple #8
0
    def cleanup(self):
        """
        Execute the clean-up.

        Runs the Cleaner purge stages on self.path and then walks the job state
        files found below self.path. For every job state file that has not been
        modified for more than self.limit hours, the work directory of the job
        is erased when the job is in 'running' or 'starting' state, or in
        'holding' state and untouched for more than seven days. At most 30 job
        state files are processed per call.

        :return: False if the clean-up is turned off (self.clean is false) or a
            work directory could not be erased (the scan stops at that point),
            True otherwise.
        """

        if not self.clean:
            tolog("Clean-up turned off")
            return False

        status = True

        tolog("Executing empty dirs clean-up, stage 1/5")
        Cleaner.purgeEmptyDirs(self.path)

        tolog("Executing work dir clean-up, stage 2/5")
        Cleaner.purgeWorkDirs(self.path)

        tolog("Executing maxed-out dirs clean-up, stage 3/5")
        Cleaner.purgeMaxedoutDirs(self.path)

        tolog("Executing AthenaMP clean-up, stage 4/5 <SKIPPED>")

        tolog("Executing PanDA Pilot dir clean-up, stage 5/5")
        JS = JobState()

        # grab all job state files in all work directories
        job_state_files = glob(self.path + "/Panda_Pilot_*/jobState-*.pickle")
        number_of_files = len(job_state_files)
        max_cleanups = 30
        tolog("Number of found job state files: %d" % (number_of_files))
        if not job_state_files:
            tolog("No job state files were found, aborting clean-up")
            return status

        # loop over all found job state files
        for file_number, file_path in enumerate(job_state_files, 1):
            if file_number > max_cleanups:
                tolog("Maximum number of job recoveries exceeded for this pilot: %d" % (max_cleanups))
                break
            tolog("Processing job state file %d/%d: %s" % (file_number, number_of_files, file_path))
            current_time = int(time.time())

            # when was file last modified?
            try:
                file_modification_time = os.path.getmtime(file_path)
            except OSError:
                # skip this file since it was not possible to read the modification time
                continue

            # was the job state file updated longer than the time limit? (convert to seconds)
            mod_time = current_time - file_modification_time
            if mod_time <= self.limit * 3600:
                tolog("File was last modified %d seconds ago (skip)" % (mod_time))
                continue

            tolog("File was last modified %d seconds ago (proceed)" % (mod_time))
            cmd = "whoami; ls -lF %s; ls -lF %s" % (file_path, os.path.dirname(file_path))
            tolog("Executing command: %s" % (cmd))
            ec, rs = commands.getstatusoutput(cmd)
            if ec == 0:
                tolog("%s" % (rs))
            else:
                tolog("!!WARNING!!2999!! %d, %s" % (ec, rs))

            # open the job state file
            if not JS.get(file_path):
                continue

            # decode the job state info
            _job, _site, _node, _recoveryAttempt = JS.decode()
            if not (_job and _site and _node):
                # nothing usable was decoded (checked before _job is touched below)
                continue

            # add member if it doesn't exist (new Job version)
            if not hasattr(_job, "prodSourceLabel"):
                _job.prodSourceLabel = ''

            # query the job state file for job information
            job_state = _job.result[0]
            stale_holding = job_state == 'holding' and mod_time > 7 * 24 * 3600
            if job_state not in ('running', 'starting') and not stale_holding:
                tolog("Job found in state: %s" % (job_state))
                continue

            if job_state == 'holding':
                tolog("Job %s was found in %s state but has not been modified for a long time - will be cleaned up" % (_job.jobId, job_state))
            else:
                tolog("Job %s was found in %s state - will be cleaned up" % (_job.jobId, job_state))
            tolog("Erasing directory: %s" % (_site.workdir))
            # NOTE(review): the workdir is interpolated unquoted into a shell command
            cmd = "rm -rf %s" % (_site.workdir)
            try:
                ec, rs = commands.getstatusoutput(cmd)
            except Exception as e:
                tolog("!!WARNING!!5500!! Could not erase lost job workdir: %s" % (e))
                status = False
                break

            # getstatusoutput() reports a failing command through the exit code, not an exception
            if ec != 0:
                tolog("!!WARNING!!5500!! Could not erase lost job workdir: %d, %s" % (ec, rs))
                status = False
                break

            tolog("Lost job workdir removed")

        return status
Exemple #9
0
    def cleanup(self):
        """
        Execute the clean-up.

        Runs the Cleaner purge stages on self.path and then walks the job state
        files found below self.path. For every job state file that has not been
        modified for more than self.limit hours, the work directory of the job
        is erased when the job is in 'running' or 'starting' state, or in
        'holding' state and untouched for more than seven days. At most 30 job
        state files are processed per call.

        :return: False if the clean-up is turned off (self.clean is false) or a
            work directory could not be erased (the scan stops at that point),
            True otherwise.
        """

        if not self.clean:
            tolog("Clean-up turned off")
            return False

        status = True

        tolog("Executing empty dirs clean-up, stage 1/5")
        Cleaner.purgeEmptyDirs(self.path)

        tolog("Executing work dir clean-up, stage 2/5")
        Cleaner.purgeWorkDirs(self.path)

        tolog("Executing maxed-out dirs clean-up, stage 3/5")
        Cleaner.purgeMaxedoutDirs(self.path)

        tolog("Executing AthenaMP clean-up, stage 4/5 <SKIPPED>")

        tolog("Executing PanDA Pilot dir clean-up, stage 5/5")
        JS = JobState()

        # grab all job state files in all work directories
        job_state_files = glob(self.path +
                               "/Panda_Pilot_*/jobState-*.pickle")
        number_of_files = len(job_state_files)
        max_cleanups = 30
        tolog("Number of found job state files: %d" % (number_of_files))
        if not job_state_files:
            tolog("No job state files were found, aborting clean-up")
            return status

        # loop over all found job state files
        for file_number, file_path in enumerate(job_state_files, 1):
            if file_number > max_cleanups:
                tolog(
                    "Maximum number of job recoveries exceeded for this pilot: %d"
                    % (max_cleanups))
                break
            tolog("Processing job state file %d/%d: %s" %
                  (file_number, number_of_files, file_path))
            current_time = int(time.time())

            # when was file last modified?
            try:
                file_modification_time = os.path.getmtime(file_path)
            except OSError:
                # skip this file since it was not possible to read the modification time
                continue

            # was the job state file updated longer than the time limit? (convert to seconds)
            mod_time = current_time - file_modification_time
            if mod_time <= self.limit * 3600:
                tolog("File was last modified %d seconds ago (skip)" %
                      (mod_time))
                continue

            tolog("File was last modified %d seconds ago (proceed)" %
                  (mod_time))
            cmd = "whoami; ls -lF %s; ls -lF %s" % (
                file_path, os.path.dirname(file_path))
            tolog("Executing command: %s" % (cmd))
            ec, rs = commands.getstatusoutput(cmd)
            if ec == 0:
                tolog("%s" % (rs))
            else:
                tolog("!!WARNING!!2999!! %d, %s" % (ec, rs))

            # open the job state file
            if not JS.get(file_path):
                continue

            # decode the job state info
            _job, _site, _node, _recoveryAttempt = JS.decode()
            if not (_job and _site and _node):
                # nothing usable was decoded (checked before _job is touched below)
                continue

            # add member if it doesn't exist (new Job version)
            if not hasattr(_job, "prodSourceLabel"):
                _job.prodSourceLabel = ''

            # query the job state file for job information
            job_state = _job.result[0]
            stale_holding = (job_state == 'holding'
                             and mod_time > 7 * 24 * 3600)
            if job_state not in ('running', 'starting') and not stale_holding:
                tolog("Job found in state: %s" % (job_state))
                continue

            if job_state == 'holding':
                tolog(
                    "Job %s was found in %s state but has not been modified for a long time - will be cleaned up"
                    % (_job.jobId, job_state))
            else:
                tolog("Job %s was found in %s state - will be cleaned up" %
                      (_job.jobId, job_state))
            tolog("Erasing directory: %s" % (_site.workdir))
            # NOTE(review): the workdir is interpolated unquoted into a shell command
            cmd = "rm -rf %s" % (_site.workdir)
            try:
                ec, rs = commands.getstatusoutput(cmd)
            except Exception as e:
                tolog(
                    "!!WARNING!!5500!! Could not erase lost job workdir: %s"
                    % (e))
                status = False
                break

            # getstatusoutput() reports a failing command through the exit code, not an exception
            if ec != 0:
                tolog(
                    "!!WARNING!!5500!! Could not erase lost job workdir: %d, %s"
                    % (ec, rs))
                status = False
                break

            tolog("Lost job workdir removed")

        return status
Exemple #10
0
def DeferredStageoutJob(job_dir, job_state_file="", deferred_stageout_logfile=False,
                        **kwargs):
    """
    Performs staging out preparation and stages out the job in specified directory.

    :param job_dir:     (string)    directory with a job.
                        mandatory parameter
    :param job_state_file:  (string)    path to job state file or other file containing job state. If empty, job
                                        state file is located as job_dir+'/jobState-*.*'.
                            defaults to ""

    :param deferred_stageout_logfile: (string|False)    template name for deferred log stageout
                                                        Replaces "{job_id}" with current job id like
                                                        "log-{job_id}.txt" -> "log-124124.txt"
                                        Default False

    Other parameters are passed into other functions

    :return: (bool) the fact of stageout being performed: True when only the post-job task was needed or when
             the stage-out sequence ran through to the end; False when no job state file was found, the job
             does not need deferred stageout, the job state could not be read/decoded, or the job could not
             be prepared for stageout.
    """
    log("Deferred stageout from job directory \"%s\"" % job_dir)

    job_state = JobState()

    if job_state_file == "":
        try:
            job_state_file = glob(job_dir + "/" + jobState_file_wildcart)[0]
        except Exception:
            # typically an IndexError because the wildcard matched nothing
            log("There is no job state file in the provided directory, exiting")
            return False

    log("Job state file is %s" % job_state_file)

    # the lock is held for the whole procedure and released by the context manager on every exit path
    with LockFileWrapper(job_dir):
        if not TestJobDirForDeferredStageoutNecessity(job_dir, job_state_file, **kwargs):
            log("Job \"%s\" does not need deferred stageout procedure (yet)" % job_dir)
            return False

        if not job_state.get(job_state_file):
            log("Job state file reading failed, exiting")
            return False

        log("Working with job in \"%s\"" % job_dir)
        _job, _site, _node, _recoveryAttempt = job_state.decode()

        if not (_job and _site and _node):
            log("Can not decode jobState file, exiting")
            return False

        with LogWrapper(deferred_stageout_logfile, _job.jobId):

            rc = PrepareJobForDeferredStageout(job_state, **kwargs)

            if rc == ReturnCode.PostJobOnly:
                pUtil.postJobTask(job_state.job, job_state.site, DorE(kwargs, 'workerNode'), DorE(kwargs, 'experiment'),
                                  jr=True, ra=job_state.recoveryAttempt)
                return True

            if rc > 0:
                log("Job is not prepared for stageout, exiting")
                if rc == ReturnCode.Cleanup:
                    cleanup(job_state)
                return False

            rc, logfile, datadir, filelist = CreateTransferFileList(job_state, **kwargs)

            # prefer the metadata stored in the node structure, fall back to getMetadata()
            XMLStr = ''
            if datadir == "":
                try:
                    XMLStr = job_state.node['xml']
                except Exception:
                    # no stored metadata available; handled by the fallback below
                    pass

            if XMLStr == '':
                XMLStr = pUtil.getMetadata(job_state.site.workdir, job_state.job.jobId)

            currentdir = os.getcwd()
            pUtil.chdir(job_state.site.workdir)
            try:
                if len(filelist):
                    log("Stageout will now transfer the files")
                    rc = TransferFiles(job_state, datadir, filelist, **kwargs)

                    if rc == ReturnCode.Holding:
                        job_state.job.result[0] = "holding"
                    if rc == ReturnCode.FailedJob:
                        job_state.job.result[0] = "failed"

                    job_state.job.setState(job_state.job.result)

                # make sure we are (still) in the work dir before handling the log
                pUtil.chdir(job_state.site.workdir)
                ret = True
                if logfile != "" and not pUtil.isLogfileCopied(job_state.site.workdir):
                    log("Stageout will now transfer the log")
                    _log = JobLog()
                    ret, _ = _log.transferLogFile(job_state.job, job_state.site, DorE(kwargs, 'experiment'), dest=None,
                                                  jr=True)

                if not ret:
                    rc = ReturnCode.Holding  # We need to transfer log file regardless the files

                if rc == ReturnCode.OK:
                    if pUtil.verifyTransfer(job_state.site.workdir):
                        job_state.job.result[0] = "finished"
                    else:
                        job_state.job.result[0] = "failed"
                    job_state.job.setState(job_state.job.result)

                if job_state.job.result[0] in finalJobStates:
                    job_state.job.final_state = job_state.job.result[0]

                log("Stageout will now update the server with new status")

                rt, retNode = updatePandaServer(job_state, xmlstr=XMLStr, **kwargs)

                if rt == 0:
                    log("Job %s updated (exit code %d)" % (job_state.job.jobId, job_state.job.result[2]))

                    # did the server send back a command?
                    if "tobekilled" in job_state.job.action:
                        log("!!WARNING!!1120!! Panda server returned a \'tobekilled\' command")
                        job_state.job.result[0] = "failed"

                    # further recovery attempt unnecessary, but keep the work dir for debugging
                    if job_state.job.result[0] == "failed":
                        log("Further recovery attempts will be prevented for failed job (will leave work dir)")
                        if not job_state.rename(job_state.site, job_state.job):
                            log("(Fate of job state file left for next pilot)")

                else:
                    log("!!WARNING!!1120!! Panda server returned a %d" % (rt))

                    # store the final state so that the next pilot will know

                    # store the metadata xml
                    retNode['xml'] = XMLStr

                    # update the job state file with the new state information
                    pUtil.updateJobState(job_state.job, job_state.site, retNode, job_state.recoveryAttempt)

                log("Stageout will now proceed to post-job actions")

                if job_state.job.result[0] in finalJobStates:
                    pUtil.postJobTask(job_state.job, job_state.site,
                                      DorE(kwargs, 'workerNode'), DorE(kwargs, 'experiment'), jr=True,
                                      ra=job_state.recoveryAttempt)
            finally:
                # always leave the work dir again, also when a stage-out step raised
                pUtil.chdir(currentdir)

            if job_state.job.result[0] == "finished":
                log("Stageout will now remove the job, it is in finished state and can be removed")
                cleanup(job_state)

            return True
Exemple #11
0
def DeferredStageoutHPCJob(job_dir, deferred_stageout_logfile=False, **kwargs):
    """
    Performs staging out preparation for the HPC job in specified directory.

    The function changes into ``job_dir``, takes the lock on the HPC job state
    file, reads the job description from it, asks the server for the current
    job status and, unless the job must be skipped, hands the job over to
    ``Monitor.monitor_recovery_job``. The original working directory is always
    restored before the function returns, also on early exits and on errors.
    If no "PandaJob_*_*" entries and no "*.log.tgz.*" files remain in
    ``job_dir`` afterwards, the directory is removed.

    :param job_dir:     (string)    directory with a job.
                        mandatory parameter

    :param deferred_stageout_logfile: (string|False)    template name for deferred log stageout
                                                        Replaces "{job_id}" with current job id like
                                                        "log-{job_id}.txt" -> "log-124124.txt"
                                        Default False

    Other parameters are passed into other functions
    (``pshttpurl``, ``psport`` and ``pilot_initdir`` are looked up through ``DorE``).

    :return: (bool) the fact of stageout being performed.
             False is returned when the lock could not be taken, when the
             server answered with status code 20, when the job is already in
             a final state or marked "tobekilled", or when any error occurred.
    """
    import subprocess
    from json import load

    log("Deferred stageout from HPC job directory \"%s\"" % job_dir)

    file_path = job_dir+"/"+hpc_jobState_file_wildcart
    current_dir = os.getcwd()
    log("Working on %s" % file_path)
    log("Chdir from current dir %s to %s" % (current_dir, job_dir))
    pUtil.chdir(job_dir)

    # NOTE(review): the bare except below also traps SystemExit/KeyboardInterrupt;
    # it is kept as-is so that callers see the same best-effort behaviour as before.
    try:
        try:
            # the lock is presumably released by LockFileWrapper when the
            # with-block is left (early returns included) -- confirm against its definition
            with LockFileWrapper(file_path) as is_locked:
                if not is_locked:
                    return False

                with open(file_path) as data_file:
                    HPC_state = load(data_file)
                job_state_file = HPC_state['JobStateFile']
                job_command = HPC_state['JobCommand']

                JS = JobState()
                JS.get(job_state_file)
                _job, _site, _node, _recoveryAttempt = JS.decode()

                with LogWrapper(deferred_stageout_logfile, _job.jobId):
                    jobStatus, jobAttemptNr, jobStatusCode = pUtil.getJobStatus(_job.jobId, DorE(kwargs,'pshttpurl'),
                                                                                DorE(kwargs,'psport'),
                                                                                DorE(kwargs,'pilot_initdir'))
                    # recover this job?
                    if jobStatusCode == 20:
                        log("Received general error code from dispatcher call (leave job for later pilot)")
                        return False
                    elif jobStatus in finalJobStates or "tobekilled" in _job.action:
                        log("Job %s is currently in state \'%s\' with attemptNr = %d (according to server - will not"
                                    " perform staging out)" % (_job.jobId, jobStatus, jobAttemptNr))
                        return False

                    # update job state file at this point to prevent a parallel pilot from doing a simultaneous recovery
                    pUtil.updateJobState(_job, _site, _node, _recoveryAttempt)

            monitor = Monitor(env)
            monitor.monitor_recovery_job(_job, _site, _node, job_command, job_state_file, recover_dir=job_dir)
        finally:
            # always leave job_dir again: the early returns and any exception
            # above must not leave the process inside the job directory, and
            # the glob/removal below has to run from the original directory
            log("Chdir back to %s" % current_dir)
            pUtil.chdir(current_dir)

        panda_jobs = glob(job_dir + "/PandaJob_*_*")
        panda_logs = glob(job_dir + "/*.log.tgz.*")
        if panda_jobs or panda_logs:
            log("Number of founded panda jobs: %d, number of panda log tar file %d, will not remove job dir"
                        % (len(panda_jobs), len(panda_logs)))
        else:
            log("Number of founded panda jobs: %d, number of panda log tar file %d, will remove job dir"
                        % (len(panda_jobs), len(panda_logs)))
            log("Remove job dir %s" % job_dir)
            # argument list instead of a shell string: same "rm -rf" semantics,
            # but paths with spaces or shell metacharacters are handled safely
            subprocess.call(["rm", "-rf", job_dir])
        return True
    except:
        log("Failed to start deferred stage out for HPC job: %s" % traceback.format_exc())
        return False