Python PilotErrors 예제들, PilotErrors.PilotErrors Python 예제들

예제 #1

0

파일 보기

파일: S3ObjectstorePresignedURLSiteMover.py 프로젝트: PanDAWMS/pilot

    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """
        Copy an input file from the storage element to a local directory.

        The transfer itself is delegated to self.stageIn(); the outcome is
        recorded in the tracing report through self.prepareReport().

        Parameters:
            gpfn      -- not referenced by this mover (kept for the common signature)
            lfn       -- logical file name; joined with path to form the local target
            path      -- local destination directory ('' is treated as './')
            fsize     -- expected file size, forwarded to stageIn()
            fchecksum -- expected checksum, forwarded to stageIn()
            guid      -- file GUID, forwarded to the tracing report
            pdict     -- extra options; reads 'jobId', 'workDir', 'experiment',
                         'pandaProxySecretKey', 'jobsetID' and the mandatory 'report'

        Returns:
            (status, output) exactly as returned by self.stageIn(); status 0 is
            treated as success.

        Raises:
            KeyError if pdict contains no 'report' entry.
        """

        # Get input parameters from pdict
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')
        pandaProxySecretKey = pdict.get('pandaProxySecretKey')
        jobSetID = pdict.get('jobsetID')

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 's3objectstorepresignedurl', lfn, guid)

        if path == '':
            path = './'
        fullname = os.path.join(path, lfn)

        status, output = self.stageIn(jobId, lfn, jobSetID, pandaProxySecretKey, fullname, fsize, fchecksum, experiment)

        if status == 0:
            updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", type="input")
            state = "DONE"
        else:
            # translate the error code into a tracing state; fall back to a
            # generic failure state for codes without a registered name
            state = PilotErrors().getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"

        self.prepareReport(state, report)
        return status, output

예제 #2

0

파일 보기

파일: S3ObjectstoreSiteMover.py 프로젝트: anisyonk/pilot

    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """
        Copy an input file from the S3 objectstore to a local directory.

        Parameters:
            gpfn      -- source location, forwarded to self.stageIn()
            lfn       -- logical file name; joined with path to form the local target
            path      -- local destination directory ('' is treated as './')
            fsize     -- expected file size, forwarded to stageIn()
            fchecksum -- expected checksum, forwarded to stageIn()
            guid      -- file GUID, forwarded to the tracing report
            pdict     -- extra options; reads 'jobId', 'workDir', 'experiment'
                         and the mandatory 'report'

        Returns:
            (status, output) exactly as returned by self.stageIn(); status 0 is
            treated as success.

        Raises:
            KeyError if pdict contains no 'report' entry.
        """

        # Get input parameters from pdict
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')

        # get the DQ2 tracing report
        report = self.getStubTracingReport(pdict['report'], 's3objectstore', lfn, guid)

        if path == '':
            path = './'
        fullname = os.path.join(path, lfn)

        status, output = self.stageIn(gpfn, fullname, fsize, fchecksum, experiment)

        if status == 0:
            updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", type="input")
            state = "DONE"
        else:
            # translate the error code into a tracing state; fall back to a
            # generic failure state for codes without a registered name
            state = PilotErrors().getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"

        self.prepareReport(state, report)
        return status, output

예제 #3

0

파일 보기

파일: S3ObjectstoreSiteMover.py 프로젝트: PanDAWMS/pilot

    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """
        Copy an output file from local disk to the S3 objectstore.

        Parameters:
            source      -- local path of the file to upload
            destination -- target URL; its scheme/host/port are written to the
                           tracing report as 'remoteSite'
            fsize       -- not referenced here (kept for the common signature)
            fchecksum   -- not referenced here (kept for the common signature)
            pdict       -- extra options; reads 'lfn', 'guid', 'token', 'scope',
                           'experiment', 'outputDir', 'os_bucket_id', 'timeout'
                           and the mandatory 'report'

        Returns:
            On success: (0, pilotErrorDiag, surl, size, checksum, self.arch_type).
            On failure: whatever self.put_data_retfail(status, output, surl) returns.

        Raises:
            KeyError if pdict contains no 'report' entry.
        """
        # function is based on dCacheSiteMover put function

        pilotErrorDiag = ""

        # Get input parameters from pdict
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        experiment = pdict.get('experiment', '')
        outputDir = pdict.get('outputDir', '')
        os_bucket_id = pdict.get('os_bucket_id', -1)
        # a missing/empty/zero timeout falls back to the mover default
        timeout = pdict.get('timeout') or self.timeout

        # NOTE(review): the returned site information object is not used in this
        # method; the call is kept in case creating it has side effects -- confirm
        getSiteInformation(experiment)

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 's3objectstore', lfn, guid)

        parsed = urlparse.urlparse(destination)
        hostname, _sep, port = parsed.netloc.partition(':')
        try:
            report['remoteSite'] = '%s://%s:%s' % (parsed.scheme, hostname, int(port))
        except ValueError:
            # destination carries no (or a non-numeric) port: report the site
            # without one instead of aborting the stage-out
            report['remoteSite'] = '%s://%s' % (parsed.scheme, hostname)

        surl = destination
        self.log("surl=%s, timeout=%s" % (surl, timeout))
        if "log.tgz" in surl:
            surl = surl.replace(lfn, "%s:%s" % (scope, lfn))
        else:
            report['eventType'] = 'put_es'

        status, output, size, checksum = self.stageOut(source, surl, token, experiment, outputDir=outputDir, timeout=timeout, os_bucket_id=os_bucket_id, report=report)
        if status != 0:
            # translate the error code into a tracing state; fall back to a
            # generic failure state for codes without a registered name
            state = PilotErrors().getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"
            self.prepareReport(state, report)
            return self.put_data_retfail(status, output, surl)

        self.prepareReport("DONE", report)
        return 0, pilotErrorDiag, surl, size, checksum, self.arch_type

예제 #4

0

파일 보기

파일: S3ObjectstorePresignedURLSiteMover.py 프로젝트: PanDAWMS/pilot

    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """
        Copy an output file from local disk to the objectstore via self.stageOut().

        Parameters:
            source      -- local path of the file to upload
            destination -- returned unchanged as the surl; not passed to stageOut()
            fsize       -- not referenced here (kept for the common signature)
            fchecksum   -- not referenced here (kept for the common signature)
            pdict       -- extra options; reads 'jobId', 'jobsetID', 'lfn', 'guid',
                           'experiment', 'prodSourceLabel', 'outputDir', 'timeout',
                           'pandaProxySecretKey' and the mandatory 'report'

        Returns:
            On success: (0, pilotErrorDiag, surl, size, checksum, self.arch_type).
            On failure: whatever self.put_data_retfail(status, output, surl) returns.

        Raises:
            KeyError if pdict contains no 'report' entry.
        """
        # function is based on dCacheSiteMover put function

        pilotErrorDiag = ""

        # Get input parameters from pdict
        jobId = pdict.get('jobId', '')
        jobSetID = pdict.get('jobsetID', '')
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        experiment = pdict.get('experiment', '')
        prodSourceLabel = pdict.get('prodSourceLabel', '')
        outputDir = pdict.get('outputDir', '')
        pandaProxySecretKey = pdict.get('pandaProxySecretKey')
        # a missing/empty/zero timeout falls back to the mover default
        timeout = pdict.get('timeout') or self.timeout

        # NOTE(review): the returned site information object is not used in this
        # method; the call is kept in case creating it has side effects -- confirm
        getSiteInformation(experiment)

        tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 's3objectstorepresignedurl', lfn, guid)

        surl = destination
        status, output, size, checksum = self.stageOut(source, jobId, lfn, jobSetID, pandaProxySecretKey, experiment, outputDir=outputDir, timeout=timeout)
        if status != 0:
            # translate the error code into a tracing state; fall back to a
            # generic failure state for codes without a registered name
            state = PilotErrors().getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"
            self.prepareReport(state, report)
            return self.put_data_retfail(status, output, surl)

        # NOTE(review): no tracing report is sent on success here (the call was
        # already disabled in this method) -- confirm that this is intended
        return 0, pilotErrorDiag, surl, size, checksum, self.arch_type

예제 #5

0

파일 보기

파일: S3ObjectstoreSiteMover.py 프로젝트: anisyonk/pilot

    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """
        Copy an output file from local disk to the S3 objectstore.

        Parameters:
            source      -- local path of the file to upload
            destination -- target location, forwarded to self.stageOut() as the surl
            fsize       -- not referenced here (kept for the common signature)
            fchecksum   -- not referenced here (kept for the common signature)
            pdict       -- extra options; reads 'lfn', 'guid', 'token', 'analJob',
                           'experiment', 'prodSourceLabel' and the mandatory 'report'

        Returns:
            On success: (0, pilotErrorDiag, surl, size, checksum, self.arch_type).
            On failure: whatever self.put_data_retfail(status, output, surl) returns.

        Raises:
            KeyError if pdict contains no 'report' entry.
        """
        # function is based on dCacheSiteMover put function

        pilotErrorDiag = ""

        # Get input parameters from pdict
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        analysisJob = pdict.get('analJob', False)
        experiment = pdict.get('experiment', '')
        prodSourceLabel = pdict.get('prodSourceLabel', '')

        # NOTE(review): the returned site information object is not used in this
        # method; the call is kept in case creating it has side effects -- confirm
        getSiteInformation(experiment)

        tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
        if prodSourceLabel == 'ddm' and analysisJob:
            # only the log message remains: nothing below depends on the
            # analysis/production distinction in this mover
            tolog("Treating PanDA Mover job as a production job during stage-out")

        # get the DQ2 tracing report
        report = self.getStubTracingReport(pdict['report'], 's3objectstore', lfn, guid)

        surl = destination
        status, output, size, checksum = self.stageOut(source, surl, token, experiment)
        if status != 0:
            # translate the error code into a tracing state; fall back to a
            # generic failure state for codes without a registered name
            state = PilotErrors().getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"
            self.prepareReport(state, report)
            return self.put_data_retfail(status, output, surl)

        self.prepareReport("DONE", report)
        return 0, pilotErrorDiag, surl, size, checksum, self.arch_type

예제 #6

0

파일 보기

파일: S3SiteMover.py 프로젝트: PanDAWMS/pilot

    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """
        Copy an input file from S3 storage to a local directory.

        The gpfn is first converted with the site's copyprefix; the transfer is
        only attempted if the converted path starts with "s3:".

        Parameters:
            gpfn      -- source location before copyprefix conversion
            lfn       -- logical file name; joined with path to form the local target
            path      -- local destination directory ('' is treated as './')
            fsize     -- expected file size, forwarded to stageIn()
            fchecksum -- expected checksum, forwarded to stageIn()
            guid      -- file GUID, forwarded to the tracing report
            pdict     -- extra options; reads 'jobId', 'workDir', 'experiment',
                         'os_bucket_id' and the mandatory 'report'

        Returns:
            (status, output): the stageIn() result, or
            (PilotErrors.ERR_STAGEINFAILED, error text) when the path conversion
            does not yield an S3 path.

        Raises:
            KeyError if pdict contains no 'report' entry.
        """

        # Get input parameters from pdict
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')
        os_bucket_id = pdict.get('os_bucket_id', -1)

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 'gfal-copy', lfn, guid)

        if path == '':
            path = './'
        fullname = os.path.join(path, lfn)

        # get the site information object and convert the path with copyprefix
        si = getSiteInformation(experiment)
        ret_path = si.getCopyPrefixPathNew(gpfn, stageIn=True)
        if ret_path.startswith("s3:"):
            status, output = self.stageIn(ret_path, fullname, fsize, fchecksum, experiment, os_bucket_id=os_bucket_id)
        else:
            errorLog = "Failed to use copyprefix to convert the current path to S3 path."
            tolog("!!WARNING!!1777!! %s" % (errorLog))
            status = PilotErrors.ERR_STAGEINFAILED
            output = errorLog

        if status == 0:
            updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", ftype="input")
            state = "DONE"
        else:
            # translate the error code into a tracing state; fall back to a
            # generic failure state for codes without a registered name
            state = PilotErrors().getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"

        self.prepareReport(state, report)
        return status, output

예제 #7

0

파일 보기

파일: RunJobUtilities.py 프로젝트: mlassnig/pilot

def getFinalState(result):
    """
    Figure out the final job state ("finished" or "failed").
    Simplifies job recovery.

    result is read by index: result[0] is compared against "holding",
    result[1] is the transExitCode and result[2] the pilotErrorCode.
    """

    # job has failed if transExitCode != 0
    if result[1] != 0:
        return "failed"

    error = PilotErrors()
    pilot_error_code = result[2]

    # job has finished if pilotErrorCode is in the allowed list of recoverable codes ...
    if error.isRecoverableErrorCode(pilot_error_code):
        return "finished"

    # ... or if a held job received the kill signal
    if pilot_error_code == error.ERR_KILLSIGNAL and result[0] == "holding":
        return "finished"

    return "failed"

예제 #8

0

파일 보기

파일: S3ObjectstoreSiteMover.py 프로젝트: PanDAWMS/pilot

    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """
        Copy an input file from the S3 objectstore to a local directory.

        Parameters:
            gpfn      -- source URL; forwarded to self.stageIn() and parsed for the
                         'remoteSite' entry of the tracing report
            lfn       -- logical file name; joined with path to form the local target
            path      -- local destination directory ('' is treated as './')
            fsize     -- expected file size, forwarded to stageIn()
            fchecksum -- expected checksum, forwarded to stageIn()
            guid      -- file GUID, forwarded to the tracing report
            pdict     -- extra options; reads 'jobId', 'workDir', 'experiment',
                         'os_bucket_id' and the mandatory 'report'

        Returns:
            (status, output) exactly as returned by self.stageIn(); status 0 is
            treated as success.

        Raises:
            KeyError if pdict contains no 'report' entry.
        """

        # Get input parameters from pdict
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')
        os_bucket_id = pdict.get('os_bucket_id', -1)

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 's3objectstore', lfn, guid)

        if path == '':
            path = './'
        fullname = os.path.join(path, lfn)

        status, output = self.stageIn(gpfn, fullname, fsize, fchecksum, experiment, os_bucket_id=os_bucket_id, report=report)
        report['eventType'] = 'get_es'

        parsed = urlparse.urlparse(gpfn)
        hostname, _sep, port = parsed.netloc.partition(':')
        try:
            report['remoteSite'] = '%s://%s:%s' % (parsed.scheme, hostname, int(port))
        except ValueError:
            # gpfn carries no (or a non-numeric) port: report the site without
            # one rather than losing the result of the finished transfer
            report['remoteSite'] = '%s://%s' % (parsed.scheme, hostname)

        if status == 0:
            updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", ftype="input")
            state = "DONE"
        else:
            # translate the error code into a tracing state; fall back to a
            # generic failure state for codes without a registered name
            state = PilotErrors().getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"

        self.prepareReport(state, report)
        return status, output

예제 #9

0

파일 보기

파일: S3SiteMover.py 프로젝트: complynx/pilot

    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """
        Copy an output file from local disk to S3 storage.

        Steps, in order:
          1. resolve the destination paths through si.getProperPaths()
          2. record the local adler32 size/checksum of the source file
          3. convert the surl with the site's copyprefix (must yield an "s3:" path)
          4. upload with self.stageOut()
          5. if the size reported by stageOut() equals the local size, recompute the
             local adler32 and return it in place of the stageOut() checksum

        Every exit path writes a state to the tracing report via prepareReport().

        Parameters:
            source      -- local path of the file to upload
            destination -- not referenced in this method (the surl comes from getProperPaths())
            fsize       -- not referenced in this method
            fchecksum   -- not referenced in this method
            pdict       -- extra options; 'alt', 'lfn', 'guid', 'token', 'scope', 'dsname',
                           'analJob', 'prodSourceLabel', 'experiment' and the mandatory
                           'report' are used below

        Returns:
            On success: (0, pilotErrorDiag, surl, size, checksum, self.arch_type).
            On failure: whatever self.put_data_retfail(...) returns.
        """
        # function is based on dCacheSiteMover put function

        error = PilotErrors()
        pilotErrorDiag = ""


        # Get input parameters from pdict
        alt = pdict.get('alt', False)
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        dsname = pdict.get('dsname', '')
        analysisJob = pdict.get('analJob', False)
        testLevel = pdict.get('testLevel', '0')
        extradirs = pdict.get('extradirs', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)
        prodSourceLabel = pdict.get('prodSourceLabel', '')

        # get the site information object
        si = getSiteInformation(experiment)

        tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
        if prodSourceLabel == 'ddm' and analysisJob:
            tolog("Treating PanDA Mover job as a production job during stage-out")
            analysisJob = False

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 'gfal-copy', lfn, guid)

        filename = os.path.basename(source)

        # get all the proper paths
        ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(error, analysisJob, token, prodSourceLabel, dsname, filename, scope=scope, alt=alt, sitemover=self) # quick workaround
        if ec != 0:
            self.prepareReport(tracer_error, report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        # get local adler32 checksum (taken before the transfer, compared again afterwards)
        status, output, adler_size, adler_checksum = self.getLocalFileInfo(source, checksumType="adler32")
        if status != 0:
            errorLog = 'Failed to get local file %s adler32 checksum: %s' % (source, output)
            tolog("!!WARNING!!1777!! %s" % (errorLog))
            # NOTE(review): a stage-IN error code is used in this stage-out method
            # (here and in the failure branches below) -- confirm this is intended
            status = PilotErrors.ERR_STAGEINFAILED
            state = "PSTAGE_FAIL"
            output = errorLog
            self.prepareReport(state, report)
            return self.put_data_retfail(status, output, surl)

        # convert the surl into the path that is handed to stageOut()
        ret_path = si.getCopyPrefixPathNew(surl, stageIn=False)
        tolog("Convert destination: %s to new path: %s" % (surl, ret_path))
        if not ret_path.startswith("s3:"):
            errorLog = "Failed to use copyprefix to convert the current path to S3 path."
            tolog("!!WARNING!!1777!! %s" % (errorLog))
            status = PilotErrors.ERR_STAGEINFAILED
            state = "PSTAGE_FAIL"
            output = errorLog
            size = None
            checksum = None
        else:
            status, output, size, checksum = self.stageOut(source, ret_path, token, experiment)

        if status !=0:
            # translate the error code into a tracing state; fall back to a
            # generic failure state for codes without a registered name
            errors = PilotErrors()
            state = errors.getErrorName(status)
            if state == None:
                state = "PSTAGE_FAIL"
            self.prepareReport(state, report)
            return self.put_data_retfail(status, output, surl)
        else:
            # when the sizes differ, this check is skipped and the checksum
            # returned by stageOut() is kept as-is
            if size == adler_size:
                tolog("The file size is not changed. Will check whether adler32 changed.")
                status, output, new_adler_size, new_adler_checksum = self.getLocalFileInfo(source, checksumType="adler32")
                if status != 0:
                    errorLog = 'Failed to get local file %s adler32 checksum: %s' % (source, output)
                    tolog("!!WARNING!!1777!! %s" % (errorLog))
                    status = PilotErrors.ERR_STAGEINFAILED
                    state = "PSTAGE_FAIL"
                    output = errorLog
                    self.prepareReport(state, report)
                    return self.put_data_retfail(status, output, surl)
                else:
                    if adler_checksum == new_adler_checksum:
                        # the local file is unchanged: return its adler32 instead of
                        # the checksum that stageOut() produced
                        tolog("The file checksum is not changed. Will use adler32 %s to replace the md5 checksum %s" % (adler_checksum, checksum))
                        checksum = adler_checksum
                    else:
                        # the local file changed while it was being transferred
                        errorLog = "The file checksum changed from %s(before transfer) to %s(after transfer)" % (adler_checksum, new_adler_checksum)
                        tolog("!!WARNING!!1777!! %s" % (errorLog))
                        status = PilotErrors.ERR_STAGEINFAILED
                        state = "PSTAGE_FAIL"
                        output = errorLog
                        self.prepareReport(state, report)
                        return self.put_data_retfail(status, output, surl)

        state = "DONE"
        self.prepareReport(state, report)
        return 0, pilotErrorDiag, surl, size, checksum, self.arch_type

예제 #10

0

파일 보기

class ATLASSiteInformation(SiteInformation):

    # private data members
    __experiment = "ATLAS"
    __instance = None
    __error = PilotErrors()                  # PilotErrors object
    __securityKeys = {}
    __benchmarks = None

    # Required methods

    def __init__(self):
        """ Default initialization (intentionally does nothing) """

    def __new__(cls, *args, **kwargs):
        """ Create the instance on first use and hand back the stored one afterwards (singleton) """

        instance = cls.__instance
        if not instance:
            # nothing stored yet: build the object and remember it on the class
            instance = super(ATLASSiteInformation, cls).__new__(cls, *args, **kwargs)
            cls.__instance = instance

        return instance

    def getExperiment(self):
        """ Return the experiment name stored in the private class member (a string) """

        experiment_name = self.__experiment
        return experiment_name

    def isTier1(self, sitename):
        """ Return True if sitename appears in the Tier-1 list of any cloud, otherwise False """
        # E.g. on a Tier-1 site, the alternative stage-out algorithm should not be used
        # Note: sitename is PanDA sitename, not Rucio sitename (RSE)

        # stops at the first cloud whose Tier-1 list contains the site
        return any(sitename in self.getTier1List(cloud) for cloud in self.getCloudList())

    def isTier2(self, sitename):
        """ Return True if the given site is neither a Tier-1 nor a Tier-3 """
        # Logic: it is a T2 if it is not a T1 or a T3

        return not self.isTier1(sitename) and not self.isTier3()

    def isTier3(self):
        """ Return True if the 'ddm' parameter read via readpar() equals "local" """
        # Note: defined by DB

        return readpar('ddm') == "local"

    def getCloudList(self):
        """ Return the cloud names, i.e. the keys of the Tier-1 dictionary """

        return self.setTier1Info().keys()

    def setTier1Info(self):
        """ Build and return a fresh dictionary with the Tier-1 information """
        # Layout: cloud name -> [PanDA site, queue]

        return {
            "CA": ["TRIUMF", ""],
            "CERN": ["CERN-PROD", ""],
            "DE": ["FZK-LCG2", ""],
            "ES": ["pic", ""],
            "FR": ["IN2P3-CC", ""],
            "IT": ["INFN-T1", ""],
            "ND": ["ARC", ""],
            "NL": ["SARA-MATRIX", ""],
            "OSG": ["BNL_CVMFS_1", ""],
            "RU": ["RRC-KI-T1", ""],
            "TW": ["Taiwan-LCG2", ""],
            "UK": ["RAL-LCG2", ""],
            "US": ["BNL_PROD", "BNL_PROD-condor"],
        }

    def getTier1Name(self, cloud):
        """ Return the site name of the Tier-1 (first entry of the cloud's Tier-1 list) """

        tier1_entries = self.getTier1List(cloud)
        return tier1_entries[0]

    def getTier1List(self, cloud):
        """ Return the Tier-1 site/queue list of the given cloud (KeyError for an unknown cloud) """
        # Cloud : PanDA site, queue

        return self.setTier1Info()[cloud]

    def getTier1InfoFilename(self):
        """ Return the path of the Tier-1 info file inside the directory named by $PilotHomeDir """

        # the file extension is supplied by getExtension()
        info_file = "Tier-1_info.%s" % (getExtension())
        return "%s/%s" % (os.environ['PilotHomeDir'], info_file)

    def downloadTier1Info(self):
        """ Download the Tier-1 info file """

        ec = 0

        path = self.getTier1InfoFilename()
        filename = os.path.basename(path)
        dummy, extension = os.path.splitext(filename)

        # url = "http://adc-ssb.cern.ch/SITE_EXCLUSION/%s" % (filename)
        if extension == ".json":
            _cmd = "?json"
#            _cmd = "?json&preset=ssbpilot"
        else:
            _cmd = "?preset=ssbpilot"
        url = "http://atlas-agis-api.cern.ch/request/site/query/list/%s" % (_cmd)
        cmd = 'curl --connect-timeout 20 --max-time 120 -sS "%s" > %s' % (url, path)

        if os.path.exists(path):
            tolog("File %s already available" % (path))
        else:
            tolog("Will download file: %s" % (filename))

            try:
                tolog("Executing command: %s" % (cmd))
                ret, output = commands.getstatusoutput(cmd)
            except Exception, e:
                tolog("!!WARNING!!1992!! Could not download file: %s" % (e))
                ec = -1
            else:

예제 #11

0

파일 보기

파일: rfcpLFCSiteMover.py 프로젝트: virthead/COMPASS-multijob-pilot

    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """
        The local file is assubed to have a relative path that is the same of the relative path in the 'gpfn'
        loc_... are the variables used to access the file in the locally exported file system
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        useCT = pdict.get('usect', True)
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        prodDBlockToken = pdict.get('access', '')

        # get the DQ2 tracing report
        try:
            report = pdict['report']
        except:
            report = {}
        else:
            # set the proper protocol
            report['protocol'] = 'rfcpLFC'
            # mark the relative start
            report['relativeStart'] = time()
            # the current file
            report['filename'] = lfn
            # guid
            report['guid'] = guid.replace('-', '')

        tolog("gpfn is %s" % gpfn)

        # get a proper envsetup
        envsetup = self.getEnvsetup(get=True)

        if self._setup:
            _setup_str = "source %s; " % self._setup
        else:
            _setup_str = envsetup

        ec, pilotErrorDiag = verifySetupCommand(error, _setup_str)
        if ec != 0:
            self.prepareReport('RFCP_FAIL', report)
            return ec, pilotErrorDiag

        # remove any host and SFN info from PFN path
        loc_pfn = self.extractPathFromPFN(gpfn)

        try:
            if not loc_pfn.startswith(('/dpm', '/castor')):
                tolog(
                    "Potential problem with local filename. Does not start with '/dpm' or '/castor/'."
                )
        except TypeError:
            # Older version of python
            pass

        # should the root file be copied or read directly by athena?
        directIn, useFileStager = self.getTransferModes()
        if directIn:
            if useCT:
                directIn = False
                tolog(
                    "Direct access mode is switched off (file will be transferred with the copy tool)"
                )
                updateFileState(lfn,
                                workDir,
                                jobId,
                                mode="transfer_mode",
                                state="copy_to_scratch",
                                type="input")
            else:
                # determine if the file is a root file according to its name
                rootFile = self.isRootFileName(lfn)

                if prodDBlockToken == 'local' or not rootFile:
                    directIn = False
                    tolog(
                        "Direct access mode has been switched off for this file (will be transferred with the copy tool)"
                    )
                    updateFileState(lfn,
                                    workDir,
                                    jobId,
                                    mode="transfer_mode",
                                    state="copy_to_scratch",
                                    type="input")
                elif rootFile:
                    tolog(
                        "Found root file according to file name: %s (will not be transferred in direct reading mode)"
                        % (lfn))
                    report['relativeStart'] = None
                    report['transferStart'] = None
                    self.prepareReport('FOUND_ROOT', report)
                    if useFileStager:
                        updateFileState(lfn,
                                        workDir,
                                        jobId,
                                        mode="transfer_mode",
                                        state="file_stager",
                                        type="input")
                    else:
                        updateFileState(lfn,
                                        workDir,
                                        jobId,
                                        mode="transfer_mode",
                                        state="remote_io",
                                        type="input")
                    return error.ERR_DIRECTIOFILE, pilotErrorDiag
                else:
                    tolog("Normal file transfer")

        dest_path = os.path.join(path, lfn)
        #PN
        _cmd_str = '%srfcp %s %s' % (_setup_str, loc_pfn, dest_path)
        #        if ".lib." in loc_pfn:
        #            _cmd_str = '%srfcp %s %s' % (_setup_str, loc_pfn, dest_path)
        #        else:
        #            _cmd_str = '%srfcpXXX %s %s' % (_setup_str, loc_pfn, dest_path)
        tolog("Executing command: %s" % (_cmd_str))
        report['transferStart'] = time()

        # execute
        timeout = 3600
        try:
            s, telapsed, cout, cerr = timed_command(_cmd_str, timeout)
        except Exception, e:
            pilotErrorDiag = 'timed_command() threw an exception: %s' % (e)
            tolog("!!WARNING!!1111!! %s" % (pilotErrorDiag))
            s = 1
            o = str(e)
            telapsed = timeout

예제 #12

0

파일 보기

파일: rfcpLFCSiteMover.py 프로젝트: virthead/COMPASS-multijob-pilot

    def put_data(self,
                 source,
                 ddm_storage,
                 fsize=0,
                 fchecksum=0,
                 dsname='',
                 **pdict):
        """ Data transfer using rfcp - generic version
        It's not advisable to use this right now because there's no
        easy way to register the srm space token if the file is 
        copied with rfcp"""

        error = PilotErrors()
        pilotErrorDiag = ""

        tolog("put_data() got ddm_storage=%s" % (ddm_storage))

        # Get input parameters from pdict
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        dsname = pdict.get('dsname', '')
        analJob = pdict.get('analJob', False)
        extradirs = pdict.get('extradirs', '')

        if self._setup:
            _setup_str = "source %s; " % self._setup
        else:
            _setup_str = ''

        # get the DQ2 tracing report
        try:
            report = pdict['report']
        except:
            report = {}
        else:
            # set the proper protocol
            report['protocol'] = 'rfpLFC'
            # mark the relative start
            report['relativeStart'] = time()
            # the current file
            report['filename'] = lfn
            # guid
            report['guid'] = guid.replace('-', '')

        # At destination append a subdirectory which is first two fields of dsname, or 'other'
        destination = readpar('sepath')
        if destination == '':
            pilotErrorDiag = "put_data destination path in SE not defined"
            tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
            self.prepareReport('DEST_PATH_UNDEF', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED,
                                         pilotErrorDiag)
        if dsname == '':
            pilotErrorDiag = "Dataset name not specified to put_data"
            tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
            self.prepareReport('DSN_UNDEF', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED,
                                         pilotErrorDiag)
#        else:
#            dsname = self.remove_sub(dsname)
#            tolog("dsname: %s" % (dsname))

#        report['dataset'] = dsname

        pat = re.compile('([^\.]+\.[^\.]+)\..*')
        mat = pat.match(dsname)
        if mat:
            prefixdir = mat.group(1)
            destination = os.path.join(destination, prefixdir)
        else:
            pilotErrorDiag = "put_data encountered unexpected dataset name format: %s" % (
                dsname)
            tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
            self.prepareReport('DSN_FORMAT_FAIL', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED,
                                         pilotErrorDiag)

        # preparing variables
        src_pfn = source
        if fsize == 0 or fchecksum == 0:
            ec, pilotErrorDiag, fsize, fchecksum = self.getLocalFileInfo(
                src_pfn, csumtype="adler32")
        if ec != 0:
            self.prepareReport('LOCAL_FILE_INFO_FAIL', report)
            return SiteMover.SiteMover.put_data_retfail(ec, pilotErrorDiag)

        # now that the file size is known, add it to the tracing report
        report['filesize'] = fsize

        dst_se = destination
        if (
                dst_se.find('SFN') != -1
        ):  # srm://dcsrm.usatlas.bnl.gov:8443/srm/managerv1?SFN=/pnfs/usatlas.bnl.gov/
            s = dst_se.split('SFN=')
            dst_loc_se = s[1]
            dst_prefix = s[0] + 'SFN='
        else:
            _sentries = dst_se.split('/', 3)
            dst_serv = _sentries[0] + '//' + _sentries[
                2]  # 'method://host:port' is it always a ftp server? can it be srm? something else?
            dst_host = _sentries[2]  #host and port
            dst_loc_se = '/' + _sentries[3]
            dst_prefix = dst_serv

        filename = os.path.basename(source)

        # Behavior as in BNL: user files have no dsname automatically added to dir name
        m = re.search('^user', filename)
        if m:
            dsname = ''

        dst_loc_sedir = os.path.join(dst_loc_se,
                                     os.path.join(extradirs, dsname))
        copyprefix = readpar('copyprefix')
        tolog('copyprefix: %s' % (copyprefix))
        if copyprefix != '':
            # Replace prefix on pfn
            pfrom, pto = copyprefix.split('^')
            tolog("Replacing %s with %s on %s" % (pfrom, pto, dst_loc_sedir))
            dst_loc_sedir = dst_loc_sedir.replace(pfrom, pto)

        dst_loc_pfn = os.path.join(dst_loc_sedir, filename)
        dst_gpfn = dst_prefix + dst_loc_pfn

        # get the DQ2 site name from ToA
        try:
            _dq2SiteName = self.getDQ2SiteName(surl=dst_gpfn)
        except Exception, e:
            tolog(
                "Warning: Failed to get the DQ2 site name: %s (can not add this info to tracing report)"
                % str(e))

예제 #13

0

파일 보기

파일: RunJobHopper.py 프로젝트: virthead/COMPASS-multijob-pilot

        # JEM job-end callback
        try:
            from JEMstub import notifyJobEnd2JEM
            notifyJobEnd2JEM(job, tolog)
        except:
            pass  # don't care (fire and forget)

        return res, job, getstatusoutput_was_interrupted, current_job_number


if __name__ == "__main__":

    tolog("Starting RunJobHopper")
    # Get error handler
    error = PilotErrors()

    # Get runJob object
    runJob = RunJobHopper()

    # Setup HPC specific parameters for Edison

    runJob.cpu_number_per_node = 24
    runJob.walltime = 120
    runJob.max_nodes = 10
    runJob.number_of_threads = 1
    runJob.min_walltime = 10  # minutes
    runJob.waittime = 15  # minutes
    runJob.nodes = 2
    runJob.partition_comp = 'hopper'
    runJob.project_id = ""

예제 #14

0

파일 보기

class LocalSiteMover(SiteMover.SiteMover):
    """ SiteMover that uses lsm for both get and put

    The class attributes below hold the command names and the command line
    templates used to build the lsm calls.
    """
    # no registration is done
    copyCommand = "lsm"  # name used in the log message of verifySetup()
    realCopyCommand = "lsm-get"  # executable looked up with 'which' in verifySetup()
    checksum_command = "adler32"
    timeout = 3600  # returned by get_timeout() and passed to the timed command in stageInFile()
    __warningStr = '!!WARNING!!2995!! %s'
    __spacetoken = '-t %s'  # space token descriptor
    __localget = '%s lsm-get %s %s %s'  # environment, options, lfn, target directory
    __localput = '%s lsm-put %s %s %s'  # environment, space token (optional), source directory, destination
    # NOTE(review): identical to __localput at the moment
    __localputBAD = '%s lsm-put %s %s %s'  # environment, space token (optional), source directory, destination
    __localspace = '%s lsm-df %s %s'  # environment, space token (optional), storage end-point
    __par_filesize = ' --size %s'  # filesize in bytes
    __par_checksum = ' --checksum %s'  # checksum string: "adler32:NNN", "md5:NNN", default is assumed MD5
    # NOTE(review): the methods visible in this class use 'timeout' (3600), not this value - confirm which one is intended
    __timeout = 5400  # seconds
    __error = PilotErrors()
    __pilotErrorDiag = ''

    def __init__(self, setup_path, *args, **kwrds):
        """ Store the stripped setup path; the environment is not verified until setup() is called """
        # any additional positional/keyword arguments are accepted but not used
        strippedSetup = setup_path.strip()
        self._setup = strippedSetup
        self._defaultSetup = None
        self.__isSetuped = False

    def get_timeout(self):
        """ Return the timeout stored in the 'timeout' attribute """
        currentTimeout = self.timeout
        return currentTimeout

    def log(self, errorLog):
        """ Write the given message to the pilot log """
        message = errorLog
        tolog(message)

    def getSetup(self):
        """ Return the setup string (pacman setup or setup script) for the copy command used by the mover

        Note that self._setup is normalised in place (stripped, terminated with ';'
        and, for atlasLocalSetup.sh, extended with --quiet).
        """
        self._setup = self._setup.strip()
        tolog("self setup: %s" % self._setup)

        # no setup configured: return an empty string without further logging
        if not self._setup:
            return ""

        # make sure the command is terminated
        if not self._setup.endswith(";"):
            self._setup += ";"

        if "alias" in self._setup:
            # setups containing an alias are used as they are
            setupCommand = self._setup
        else:
            needsQuiet = "atlasLocalSetup.sh" in self._setup and "--quiet" not in self._setup
            if needsQuiet:
                self._setup = self._setup.replace(
                    "atlasLocalSetup.sh", "atlasLocalSetup.sh --quiet")
            # only prepend 'source' when the string is not already an export/source command
            if self._setup.startswith(("export", "source")):
                setupCommand = "%s" % self._setup
            else:
                setupCommand = "source %s" % self._setup

        if setupCommand != "":
            tolog("Using setup: %s" % (setupCommand))

        return setupCommand

    def verifySetupCommand(self, _setupStr):
        """ Verify that the files used by the source command(s) of the setup string exist

        Returns a (status, output) tuple; status is 0 unless a file is missing, in
        which case it is PilotErrors.ERR_NOSUCHFILE and output["errorLog"] holds
        the error message.
        """

        statusRet = 0
        outputRet = {"errorLog": None, "report": {"clientState": None}}

        # single quotes are dropped before the string is inspected
        _setupStr = _setupStr.replace("'", "")
        self.log("Will verify: %s" % (_setupStr))

        # without a source command there is nothing to check
        if _setupStr == "" or "source " not in _setupStr:
            self.log(
                "Nothing to verify in setup: %s (either empty string or no source command)"
                % (_setupStr))
            return statusRet, outputRet

        # extract the file paths from the source command(s); an empty result means nothing to test
        for candidate in extractFilePaths(_setupStr) or []:
            # entries containing a '-' are skipped
            if "-" in candidate:
                continue
            if not os.path.exists(candidate):
                # stop at the first missing file
                missingFile = "No such file or directory: %s" % (candidate)
                outputRet["errorLog"] = missingFile
                self.log('!!WARNING!!2991!! %s' % (missingFile))
                statusRet = PilotErrors.ERR_NOSUCHFILE
                break
            self.log("File %s has been verified" % (candidate))

        return statusRet, outputRet

    def verifySetupProxy(self, _setupStr, experiment):
        """ Ask the experiment object to verify the proxy using the given setup string """

        # the proxy verification is delegated to the experiment specific class
        experimentObject = getExperiment(experiment)
        proxyStatus, proxyOutput = experimentObject.verifyProxy(envsetup=_setupStr)

        return proxyStatus, proxyOutput

    def verifySetup(self, _setupStr, experiment, proxycheck=True):
        """ Verify the setup string in up to three steps

        1. the sourced setup files must exist (verifySetupCommand)
        2. the real copy command must be found with 'which' after the setup
        3. optionally (proxycheck) the proxy must be valid (verifySetupProxy)

        Returns a (status, output) tuple; on failure output["errorLog"] and
        output["report"]["clientState"] describe the problem.
        """
        # step 1: setup files
        statusRet, outputRet = self.verifySetupCommand(_setupStr)
        if statusRet != 0:
            outputRet["report"]["clientState"] = "RFCP_FAIL"
            return statusRet, outputRet

        # step 2: is the copy tool available in the environment?
        needsSeparator = _setupStr != "" and not _setupStr.endswith(';')
        separator = ";" if needsSeparator else ""
        command = _setupStr + separator + " which " + self.realCopyCommand
        status, output = commands.getstatusoutput(command)
        self.log("Execute command:  %s" % command)
        self.log("Status: %s, Output: %s" % (status, output))
        if status != 0:
            self.log(self.copyCommand + " is not found in envsetup: " +
                     _setupStr)
            outputRet["report"]["clientState"] = "RFCP_FAIL"
            outputRet["errorLog"] = output
            return status, outputRet

        # step 3: proxy (only when requested)
        if not proxycheck:
            return status, outputRet

        status, proxyLog = self.verifySetupProxy(_setupStr, experiment)
        if status != 0:
            outputRet["errorLog"] = proxyLog
            outputRet["report"]["clientState"] = 'PROXYFAIL'

        return status, outputRet

    def setup(self, experiment):
        """ Verify and activate the environment setup used for the copy tool

        The site setup returned by getSetup() is tried first; if it cannot be
        verified and a default setup (self._defaultSetup) is defined, that one is
        tried as well. The first setup that verifies is stored in self._setup and
        the mover is marked as set up, so later calls return immediately.

        Returns (0, None) when the setup has already been done, otherwise the
        (status, output) tuple of the last verifySetup() call.
        """
        if self.__isSetuped:
            return 0, None
        thisExperiment = getExperiment(experiment)
        self.useTracingService = thisExperiment.useTracingService()

        def withProxyExport(envsetup):
            """ Terminate the setup string and append the user proxy export, if available """
            envsetup = envsetup.strip()
            if envsetup != "" and not envsetup.endswith(';'):
                envsetup += ";"
            # 'in' replaces the deprecated dict.has_key(), which does not exist in Python 3
            if 'X509_USER_PROXY' in os.environ:
                envsetup += " export X509_USER_PROXY=%s;" % (
                    os.environ['X509_USER_PROXY'])
            return envsetup

        # first attempt: the site setup
        envsetupTest = withProxyExport(self.getSetup())
        self.log("to verify site setup: %s " % envsetupTest)
        status, output = self.verifySetup(envsetupTest, experiment)
        self.log("site setup verifying: status: %s, output: %s" %
                 (status, output["errorLog"]))
        if status == 0:
            self._setup = envsetupTest
            self.__isSetuped = True
            return status, output

        # second attempt: the default setup (only if one has been defined)
        if self._defaultSetup:
            self.log("Try to use default envsetup")
            envsetupTest = withProxyExport(self._defaultSetup)
            self.log("verify default setup: %s " % envsetupTest)
            status, output = self.verifySetup(envsetupTest, experiment)
            self.log("default setup verifying: status: %s, output: %s" %
                     (status, output["errorLog"]))
            if status == 0:
                self._setup = envsetupTest
                self.__isSetuped = True

        return status, output

    def fixStageInPath(self, path):
        """ Fix the stage-in path

        Paths that do not start with "srm", or that already contain an SFN part,
        are returned unchanged. Otherwise the "srm://<host>" prefix is replaced
        with the matching entry from seopt or, if there is none, with the first SE
        from the 'se' parameter; finally the port number from the SE is added to
        the path if necessary.
        """

        if path[:3] != "srm":
            return path

        if '?SFN=' in path:
            self.log("Found SFN part in file path: %s" % (path))
            return path

        try:
            hostname = path.split('/', 3)[2]
        except Exception as e:
            # the format arguments used to be part of the string literal, so the
            # path and the error were never shown in the message
            self.log(
                "!!WARNING!!2999!! Could not extract srm protocol for replacement, keeping path variable as it is: %s (%s)"
                % (path, str(e)))
            return path

        # srm = 'srm://head01.aglt2.org'
        srm = 'srm://' + hostname

        # the SE is needed for the port handling below in both branches; it used
        # to be defined only when seopt had no match, which made the seopt branch
        # fail with an unbound variable
        # NOTE(review): confirm that the first 'se' entry is the right source for
        # the port also when the replacement comes from seopt
        se = readpar('se').split(",")[0]
        _dummytoken, se = self.extractSE(se)

        # does seopt contain any matching srm's?
        sematch = self.getSEMatchFromSEOpt(srm)
        if sematch != "":
            path = path.replace(srm, sematch)
            self.log("Replaced %s with %s (from seopt) in path: %s" %
                     (srm, sematch, path))
        else:
            tolog("Using SE: %s" % (se))

            path = path.replace(srm, se)
            self.log("Replaced %s with %s (from se) in path: %s" %
                     (srm, se, path))

        # add port number from se to getfile if necessary
        path = self.addPortToPath(se, path)
        return path

    def getStageInMode(self, lfn, prodDBlockToken, transferType):
        """ Decide whether the file should be copied or read directly

        Returns (0, output) when the file should be transferred, and
        (PilotErrors.ERR_DIRECTIOFILE, output) when the site information reports
        direct access for it. output["transfer_mode"] is set whenever a transfer
        mode is reported.
        """
        report = {"clientState": None}
        output = {"errorLog": None, "report": report, "transfer_mode": None}

        # the decision is based on the file name, the token and the transfer type
        rootFileName = self.isRootFileName(lfn)
        directIn, transfer_mode = SiteInformation().getDirectInAccessMode(
            prodDBlockToken, rootFileName, transferType)

        if transfer_mode:
            output["transfer_mode"] = transfer_mode

        if not directIn:
            return 0, output

        # direct access: no transfer takes place, so there are no start times
        report["clientState"] = 'FOUND_ROOT'
        report['relativeStart'] = None
        report['transferStart'] = None

        return PilotErrors.ERR_DIRECTIOFILE, output

    def stageInFile(self,
                    source,
                    destination,
                    sourceSize,
                    sourceChecksum,
                    guid=None):
        """ Stage in the file with lsm-get (to be implemented by each site mover)

        source/destination are passed to the lsm-get command; sourceSize and
        sourceChecksum are added as --size/--checksum options when they carry a
        usable value, and guid is added as --guid.

        Returns (0, output) on success. On failure the local file is removed and
        the (status, output) tuple produced by errorToReport() is returned.
        """
        statusRet = 0
        outputRet = {}
        outputRet["errorLog"] = None
        outputRet["report"] = {}
        outputRet["report"]["clientState"] = None

        # build the parameters
        _params = ""
        if sourceSize != 0 and sourceSize != "0":
            _params += self.__par_filesize % (sourceSize)
        if sourceChecksum and sourceChecksum != 'None' and sourceChecksum != 0 and sourceChecksum != "0" and not self.isDummyChecksum(
                sourceChecksum):
            csumtype = self.getChecksumType(sourceChecksum)
            # special case for md5sum (command only understands 'md5' and 'adler32', and not 'ad' and 'md5sum')
            if csumtype == 'md5sum':
                csumtype = 'md5'
            _params += self.__par_checksum % ("%s:%s" %
                                              (csumtype, sourceChecksum), )
        # add the guid option
        # NOTE(review): with the default guid=None this adds "--guid None" to the
        # command - confirm that callers always provide a guid
        _params += " --guid %s" % (guid)

        self.log("StageIn files started.")
        _cmd_str = self.__localget % (self._setup, _params, source,
                                      destination)
        self.log('Executing command: %s' % (_cmd_str))
        # defaults used if the command cannot be executed at all
        s = -1
        o = '(not defined)'
        t0 = os.times()
        outputRet["report"]['relativeStart'] = time()
        outputRet["report"]['transferStart'] = time()
        try:
            timerCommand = TimerCommand(_cmd_str)
            s, o = timerCommand.run(timeout=self.timeout)
        except Exception as e:
            # 'as' form as used in fixStageInPath(); the old comma form is not
            # valid in Python 3
            tolog("!!WARNING!!2990!! Exception caught by stageInFile(): %s" %
                  (str(e)))
            o = str(e)
        t1 = os.times()
        # index 4 of os.times() is the elapsed real time
        t = t1[4] - t0[4]
        self.log("Command finished after %f s: %s" % (t, o.replace('\n', ' ')))

        if s == 0:
            self.log("Stagein succeeded")
        else:
            self.log("!!WARNING!!2990!! Command failed: %s" % (_cmd_str))
            o = o.replace('\n', ' ')
            self.log("!!WARNING!!2990!! get_data failed. Status=%s Output=%s" %
                     (s, str(o)))

            # remove the local file before any get retry is attempted
            _status = self.removeLocal(destination)
            if not _status:
                self.log(
                    "!!WARNING!!1112!! Failed to remove local file, get retry will fail"
                )

            # translate the command output into an error code and report
            status, output = self.errorToReport(o,
                                                t,
                                                source,
                                                stageMethod="stageIN")
            return status, output

        return statusRet, outputRet

예제 #15

0

파일 보기

파일: OtherExperiment.py 프로젝트: virthead/COMPASS-multijob-pilot

class OtherExperiment(Experiment):
    """ Experiment class for "Other"; implemented as a singleton (see __new__) """

    # private data members
    __experiment = "Other"
    __instance = None
    __error = PilotErrors()  # PilotErrors object
    __doFileLookups = False  # True for LFC based file lookups (basically a dummy data member here since singleton object is static)
    __cache = ""  # Cache URL used e.g. by LSST

    # Required methods

    def __init__(self):
        """ Default initialization (nothing to initialize) """
        # e.g. self.__errorLabel = errorLabel

    def __new__(cls, *args, **kwargs):
        """ Create the instance on first use only, afterwards always hand back the same object """

        existing = cls.__instance
        if existing:
            return existing

        cls.__instance = super(OtherExperiment,
                               cls).__new__(cls, *args, **kwargs)
        return cls.__instance

    def getExperiment(self):
        """ Return the name of the experiment as a string """

        return self.__experiment

    def setParameters(self, *args, **kwargs):
        """ Store the job object (keyword 'job') and, if one is given, whether it is an analysis job """

        # set initial values
        job = kwargs.get('job', None)
        self.__job = job
        if job:
            self.__analysisJob = isAnalysisJob(job.trf)

    def getJobExecutionCommand(self):
        """ Return the command(s) used to execute the payload (empty in this class) """
        # E.g. cmd = "source <path>/setup.sh; <path>/python "

        return ""

    def willDoFileLookups(self):
        """ Tell whether (LFC) file lookups should be done by the pilot; always False here """

        return False

    def willDoFileRegistration(self):
        """ Tell whether (LFC) file registration should be done by the pilot; always False here """

        return False

    def doFileLookups(self, doFileLookups):
        """ Update the file lookups boolean (not implemented in this class) """
        # Only implement this method if class really wants to update the __doFileLookups boolean
        # ATLAS wants to implement this, but not CMS
        # Method is used by Mover
        # self.__doFileLookups = doFileLookups

    def isOutOfMemory(self, **kwargs):
        """ Out of memory detection in stderr/out; always False here """

        return False

    def getNumberOfEvents(self, **kwargs):
        """ Number of events; always 0 here """

        return 0

    def specialChecks(self, **kwargs):
        """ Placeholder for special checks; only writes a log message and returns True """
        # Return False if fatal failure, otherwise return True
        # The pilot will abort if this method returns a False

        tolog("No special checks for \'%s\'" % (self.__experiment))

        # to be replaced with a real status once checks are implemented
        return True

    # Optional
    def setCache(self, cache):
        """ Store the cache URL """
        # Used e.g. by LSST

        self.__cache = cache

    # Optional
    def getCache(self):
        """ Return the stored cache URL """
        # Used e.g. by LSST

        return self.__cache

    # Optional
    def useTracingService(self):
        """ Tell whether the DQ2 Tracing Service should be used; always False here """
        # A service provided by the DQ2 system that allows for file transfer tracking; all file transfers
        # are reported by the pilot to the DQ2 Tracing Service if this method returns True

        return False

예제 #16

0

파일 보기

class ChirpSiteMover(SiteMover.SiteMover):
    """ SiteMover for CHIRP copy commands etc

    The class attributes below hold the command name, the chirp command line
    template and the limits used by the mover.
    """

    copyCommand = "chirp"
    checksum_command = "adler32"
    __warningStr = '!!WARNING!!2995!! %s'  # prefix used for warnings written to the log
    __chirp = 'chirp -t 300 %s %s < %s' # options, server, command file
    __timeout = 300 # seconds; returned by get_timeout() and used for the timed command in get_data()
    __error = PilotErrors()
    __pilotErrorDiag = ''
    # 200 * 1024**2 bytes; NOTE(review): not referenced in the part of the class visible here
    __MAX_FILE_SIZE = 200*1024**2

    def get_timeout(self):
        """ Return the timeout (in seconds) of the mover """
        currentTimeout = self.__timeout
        return currentTimeout

    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ copy input file from SE to local dir """

        # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
        useCT = pdict.get('usect', True)
        prodDBlockToken = pdict.get('access', '')

        # get the Rucio tracing report
        try:
            report = pdict['report']
        except:
            report = {}
        else:
            # set the proper protocol
            report['protocol'] = 'local'
            # mark the relative start
            report['relativeStart'] = time()
            # the current file
            report['filename'] = lfn
            # guid
            report['guid'] = guid.replace('-','')

        if not path:
            tolog('path is empty, using current directory')
            path = os.getcwd()

        # build setup string
        envsetup = self.getEnvsetup(get=True)

        # should the root file be copied or read directly by athena?
        directIn = False
        dInfo = getDirectAccessDic(readpar('copysetupin'))
        # if copysetupin did not contain direct access info, try the copysetup instead
        if not dInfo:
            dInfo = getDirectAccessDic(readpar('copysetup'))

        tolog("dInfo: %s" % str(dInfo))
        # check if we should use the copytool
        if dInfo:
            directIn = dInfo['directIn']

        if directIn:
            if useCT:
                directIn = False
                tolog("Direct access mode is switched off (file will be transferred with the copy tool)")
            else:
                # determine if the file is a root file according to its name
                rootFile = self.isRootFileName(lfn)

                if prodDBlockToken == 'local' or not rootFile:
                    directIn = False
                    tolog("Direct access mode has been switched off for this file (will be transferred with the copy tool)")
                elif rootFile:
                    tolog("Found root file according to file name: %s (will not be transferred in direct reading mode)" % (lfn))
                    report['relativeStart'] = None
                    report['transferStart'] = None
                    self.prepareReport('FOUND_ROOT', report)
                    return 0, self.__pilotErrorDiag
                else:
                    tolog("Normal file transfer")
        else:
            tolog("not directIn")

        # build the get command
        _params = ""
        if fchecksum and fchecksum != 'None' and fchecksum != 0 and fchecksum != "0" and not self.isDummyChecksum(fchecksum):
            csumtype = self.getChecksumType(fchecksum)
            # special case for md5sum (command only understands 'md5' and 'adler32', and not 'ad' and 'md5sum')
            if csumtype == 'md5sum':
                csumtype = 'md5'

        execStr = self.__localget % (envsetup, _params, gpfn, os.path.join(path, lfn))
        tolog("Executing command: %s" % (execStr))

        report['transferStart'] = time()
        try:
            status, telapsed, cout, cerr = timed_command(execStr, self.__timeout)
        except Exception, e:
            self.__pilotErrorDiag = 'timed_command() threw an exception: %s' % str(e)
            tolog(self.__warningStr % self.__pilotErrorDiag)
            status = 1
            output = str(e)
            telapsed = self.__timeout
        else:

예제 #17

0

파일 보기

파일: PandaServerClient.py 프로젝트: anisyonk/pilot

                    node['exeErrorCode'] = job.exeErrorCode
            else:
                node['transExitCode'] = job.result[1]
            if (job.result[0] == 'failed') and (job.exeErrorCode != 0) and (job.result[1] != job.exeErrorCode):
                if log:
                    mismatch = "MISMATCH | Trf error code mismatch: exeErrorCode = %d, transExitCode = %d" %\
                               (job.exeErrorCode, job.result[1])
                    if node.has_key('pilotLog'):
                        node['pilotLog'] = mismatch + node['pilotLog']
                    else:
                        tolog("!!WARNING!!1300!! Could not write mismatch error to log extracts: %s" % mismatch)

            # check if Pilot-controlled resubmission is required:
            if (job.result[0] == "failed" and 'ANALY' in site.sitename):
                pilotExitCode = job.result[2]
                error = PilotErrors()
                if (error.isPilotResubmissionErrorCode(pilotExitCode) or job.isPilotResubmissionRequired):
                    # negate PilotError, ensure it's negative
                    job.result[2] = -abs(pilotExitCode)
                    tolog("(Negated error code)")
                else:
                    tolog("(No need to negate error code)")

            node['pilotErrorCode'] = job.result[2]
            tolog("Pilot error code: %d" % (node['pilotErrorCode']))

            # report CPUTime and CPUunit at the end of the job
            node['cpuConsumptionTime'] = job.cpuConsumptionTime
            try:
                node['cpuConsumptionUnit'] = job.cpuConsumptionUnit + "+" + getCPUmodel()
            except:

예제 #18

0

파일 보기

파일: PandaServerClient.py 프로젝트: bbockelm/CAFUtilities

    def getNodeStructure(self, job, site, workerNode, spaceReport=False, log=None):
        """ Define the node structure expected by the server

        Builds a dictionary from the job, site and worker node objects. Which
        fields are added depends on the job state (job.result[0]):
        - finished/failed/holding: pilotErrorDiag and, if non-zero, nEvents
        - finished/failed: transformation and pilot error codes, CPU info, timing
        - holding: exeErrorCode/exeErrorDiag taken from the pilot error code
        - any other state: only the CPU model
        The log extracts (log) are added as pilotLog for failed and holding jobs
        or when they mention "outbound connections"; the space info is added when
        spaceReport is set and site.dq2space is not -1.

        Side effects: job.pilotErrorDiag may be replaced or truncated, and
        job.result[2] is negated when a pilot-controlled resubmission is required.

        Returns the node dictionary.
        """

        node = {}

        node['node'] = workerNode.nodename
        node['workdir'] = job.workdir
        node['siteName'] = site.sitename
        node['jobId'] = job.jobId
        node['state'] = job.result[0]
        node['timestamp'] = timeStamp()
        if job.attemptNr > -1:
            node['attemptNr'] = job.attemptNr
        if self.__jobSchedulerId:
            node['schedulerID'] = self.__jobSchedulerId
        if self.__pilotId:
            # report the batch system job id, if available
            batchSystemType, _id = getBatchSystemJobID()
            if batchSystemType:
                tolog("Batch system: %s" % (batchSystemType))
                tolog("Batch system job ID: %s" % (_id))
                node['pilotID'] = "%s|%s|%s|%s|%s" % (self.__pilotId, _id, batchSystemType, self.__pilot_version_tag, self.__pilot_version)
                node['batchID'] = _id
                tolog("Will send batchID: %s and pilotID: %s" % (node['batchID'], node['pilotID']))
            else:
                tolog("Batch system type was not identified (will not be reported)")
                node['pilotID'] = "%s|%s|%s" % (self.__pilotId, self.__pilot_version_tag, self.__pilot_version)
                tolog("Will send pilotID: %s" % (node['pilotID']))
            tolog("pilotId: %s" % str(self.__pilotId))
        if log and (job.result[0] in ('failed', 'holding') or "outbound connections" in log):
            node['pilotLog'] = log

        # build the jobMetrics
        node['jobMetrics'] = self.getJobMetrics(job, workerNode)

        # send pilotErrorDiag for finished, failed and holding jobs
        if job.result[0] in ('finished', 'failed', 'holding'):
            # get the pilot error diag
            # (an empty pilotErrorDiag is falsy and is handled by the else branch
            # below; the former separate check for "" could never be reached)
            if job.pilotErrorDiag:
                if "<HTML>" in job.pilotErrorDiag.upper():
                    # html is replaced with the standard message for the error code
                    tolog("Found html in pilotErrorDiag: %s" % (job.pilotErrorDiag))
                    node['pilotErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2])
                    job.pilotErrorDiag = node['pilotErrorDiag']
                    tolog("Updated pilotErrorDiag: %s" % (job.pilotErrorDiag))
                else:
                    # truncate if necessary (the first 250 characters are kept)
                    if len(job.pilotErrorDiag) > 250:
                        tolog("pilotErrorDiag will be truncated to size 250")
                        tolog("Original pilotErrorDiag message: %s" % (job.pilotErrorDiag))
                        job.pilotErrorDiag = job.pilotErrorDiag[:250]
                    # set the pilotErrorDiag (passed through tailPilotErrorDiag)
                    node['pilotErrorDiag'] = tailPilotErrorDiag(job.pilotErrorDiag)
            else:
                # no diagnostics available: use the standard message for the error code
                job.pilotErrorDiag = self.__error.getPilotErrorDiag(job.result[2])
                node['pilotErrorDiag'] = tailPilotErrorDiag(job.pilotErrorDiag)
                tolog("Updated pilotErrorDiag from None: %s" % (job.pilotErrorDiag))

            # get the number of events
            if job.nEvents != 0:
                node['nEvents'] = job.nEvents
                tolog("Total number of processed events: %d (read)" % (job.nEvents))
            else:
                tolog("runJob did not report on the total number of read events")

        if job.result[0] in ('finished', 'failed'):
            # make sure there is no mismatch between the transformation error codes (when both are reported)
            # send transformation errors depending on what is available
            if job.exeErrorDiag != "":
                node['exeErrorCode'] = job.exeErrorCode
                node['exeErrorDiag'] = job.exeErrorDiag
            else:
                node['transExitCode'] = job.result[1]
            if (job.result[0] == 'failed') and (job.exeErrorCode != 0) and (job.result[1] != job.exeErrorCode):
                if log:
                    mismatch = "MISMATCH | Trf error code mismatch: exeErrorCode = %d, transExitCode = %d" %\
                               (job.exeErrorCode, job.result[1])
                    # 'in' replaces the deprecated dict.has_key(), which does not exist in Python 3
                    if 'pilotLog' in node:
                        node['pilotLog'] = mismatch + node['pilotLog']
                    else:
                        tolog("!!WARNING!!1300!! Could not write mismatch error to log extracts: %s" % mismatch)

            # check if Pilot-controlled resubmission is required:
            if (job.result[0] == "failed" and 'ANALY' in site.sitename):
                pilotExitCode = job.result[2]
                error = PilotErrors()
                if (error.isPilotResubmissionErrorCode(pilotExitCode) or job.isPilotResubmissionRequired):
                    # negate PilotError, ensure it's negative
                    job.result[2] = -abs(pilotExitCode)
                    tolog("(Negated error code)")
                else:
                    tolog("(No need to negate error code)")

            node['pilotErrorCode'] = job.result[2]
            tolog("Pilot error code: %d" % (node['pilotErrorCode']))

            # report CPUTime and CPUunit at the end of the job
            node['cpuConsumptionTime'] = job.cpuConsumptionTime
            try:
                node['cpuConsumptionUnit'] = job.cpuConsumptionUnit + "+" + getCPUmodel()
            except Exception:
                # best effort only; unlike a bare except this does not swallow
                # KeyboardInterrupt/SystemExit
                node['cpuConsumptionUnit'] = '?'
            node['cpuConversionFactor'] = job.cpuConversionFactor

            # report specific time measures (order: getJob|stageIn|payload|stageOut|setup)
            node['pilotTiming'] = "%s|%s|%s|%s|%s" % (job.timeGetJob, job.timeStageIn, job.timeExe, job.timeStageOut, job.timeSetup)
        elif job.result[0] == 'holding':
            node['exeErrorCode'] = job.result[2]
            node['exeErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2])

        else:
            node['cpuConsumptionUnit'] = getCPUmodel()

        if spaceReport and site.dq2space != -1: # non-empty string and the space check function runs well
            node['remainingSpace'] = site.dq2space
            node['messageLevel'] = site.dq2spmsg

        return node