コード例 #1
0
def ReadQueueParams():

    queuedata = ""
    verified = False

    si = SiteInformation()
    qdfname = si.getQueuedataFileName(check=False)
#    qdfname = "%s/queuedata.dat" % (os.environ['PilotHomeDir'])
#    qdfname = "%s/queuedata.json" % (os.environ['PilotHomeDir'])
    if os.path.exists(qdfname):
        print "queuedata already downloaded"
        verified, queuedata = readqueuedata(qdfname, first_trial=True)
    else:
        print "Downloading queuedata in atlasProdPilot"

        # download queuedata and verify it
        extension = utils.getExtension(alternative='dat')
        if extension == "json":
            _ext = extension
        else:
            _ext = "pilot"
        curl_url = 'http://pandaserver.cern.ch:25085/cache/schedconfig/%s.pilot.%s' % (qname, _ext)
        si = SiteInformation()
        sslCertificate = si.getSSLCertificate()
        cmd = 'curl --connect-timeout 20 --max-time 120 --cacert %s -sS "%s" > %s' % (sslCertificate, curl_url, qdfname)
        print "Executing command: %s" % (cmd)
        try:
            ret, rs = commands.getstatusoutput(cmd)
        except Exception, e:
            print "!!WARNING!!1999!! Failed with curl command: %s" % str(e)
        else:
コード例 #2
0
ファイル: GFAL2SiteMover.py プロジェクト: complynx/pilot
    def fixStageInPath(self, path):
        """Fix the path

        Paths that start with "srm" and already contain a '?SFN=' part are only
        logged. For any other srm path the 'srm://<hostname>' prefix is
        replaced, either with the match returned by getSEMatchFromSEOpt() or,
        if there is no match, with the SE extracted from the first
        comma-separated entry of readpar('se'). Finally the path is passed
        through SiteInformation.getCopyPrefixPath(path, stageIn=True).

        :param path: file path (string).
        :return: updated path (string).
        """

        if path[:3] == "srm" and '?SFN=' in path:
            self.log("Found SFN part in file path: %s" % (path))
        elif path[:3] == "srm":
            try:
                hostname = path.split('/', 3)[2]
            except Exception as e:
                # the format arguments must be outside the string literal,
                # otherwise the unformatted template itself is logged
                self.log("!!WARNING!!2999!! Could not extract srm protocol for replacement, keeping path variable as it is: %s (%s)" %
                         (path, str(e)))
            else:
                # srm = 'srm://head01.aglt2.org'
                srm = 'srm://' + hostname

                # does seopt contain any matching srm's?
                sematch = self.getSEMatchFromSEOpt(srm)
                if sematch != "":
                    # the seopt match is substituted verbatim; there is no
                    # 'se' value in this branch to take a port number from
                    path = path.replace(srm, sematch)
                    self.log("Replaced %s with %s (from seopt) in path: %s" % (srm, sematch, path))
                else:
                    se = readpar('se').split(",")[0]
                    _dummytoken, se = self.extractSE(se)
                    tolog("Using SE: %s" % (se))

                    path = path.replace(srm, se)
                    self.log("Replaced %s with %s (from se) in path: %s" % (srm, se, path))

                    # add port number from se to getfile if necessary
                    # (only done here, where 'se' is actually defined)
                    path = self.addPortToPath(se, path)

        siteInformation = SiteInformation()
        path = siteInformation.getCopyPrefixPath(path, stageIn=True)

        return path
コード例 #3
0
ファイル: LocalSiteMover.py プロジェクト: anisyonk/pilot
    def getStageInMode(self, lfn, prodDBlockToken):
        """Decide whether the input file is copied or accessed directly.

        :param lfn: file name, checked with self.isRootFileName().
        :param prodDBlockToken: token handed to
            SiteInformation.getDirectInAccessMode().
        :return: (status, output). status is PilotErrors.ERR_DIRECTIOFILE when
            direct access is selected and 0 otherwise; output is a dictionary
            with the keys "errorLog", "report" and "transfer_mode".
        """
        report = {"clientState": None}
        output = {"errorLog": None, "report": report, "transfer_mode": None}

        is_root_file = self.isRootFileName(lfn)
        directIn, transfer_mode = SiteInformation().getDirectInAccessMode(prodDBlockToken, is_root_file)

        if transfer_mode:
            output["transfer_mode"] = transfer_mode

        # guard clause: nothing more to report for a normal copy
        if not directIn:
            return 0, output

        report["clientState"] = 'FOUND_ROOT'
        report['relativeStart'] = None
        report['transferStart'] = None
        return PilotErrors.ERR_DIRECTIOFILE, output
コード例 #4
0
    def fixStageOutPath(self, path):
        """Convert the stage-out path with the copyprefix.

        :param path: PFN to convert.
        :return: (status, output). status is 0, or
            PilotErrors.ERR_STAGEINFAILED when the converted path does not
            start with "root:". output is a dictionary with the keys
            "errorLog", "report" (holding "clientState") and "path" (the
            converted path).
        """
        site_info = SiteInformation()

        copytool = site_info.getCopyTool(stageIn=False)
        tolog("Site mover will use get command: %s, %s" % (copytool))

        # The PFN decides which copyprefix is applied, e.g. with
        # copyprefix = srm://srm-eosatlas.cern.ch,srm://srm-atlas.cern.ch^root://eosatlas.cern.ch/,root://castoratlas-xrdssl/
        # PFN  = srm://srm-eosatlas.cern.ch/eos/atlas/atlasdatadisk/rucio/mc12_8TeV/8d/c0/EVNT.01212395._000004.pool.root.1
        # TURL = root://eosatlas.cern.ch//eos/atlas/atlasdatadisk/rucio/mc12_8TeV/8d/c0/EVNT.01212395._000004.pool.root.1
        turl = site_info.getCopyPrefixPath(path, stageIn=False)

        status = 0
        report = {"clientState": None}
        output = {"errorLog": None, "report": report}
        if not turl.startswith("root:"):
            message = "Failed to use copyprefix to convert the current path to local path."
            tolog("!!WARNING!!1777!! %s" % (message))
            output["errorLog"] = message
            report["clientState"] = 'PSTAGE_FAIL'
            # NOTE(review): a stage-in error code is returned from a stage-out
            # helper - confirm that this is intended
            status = PilotErrors.ERR_STAGEINFAILED

        tolog("PFN=%s" % (path))
        tolog("TURL=%s" % (turl))
        output['path'] = turl

        return status, output
コード例 #5
0
    def getStageInMode(self, lfn, prodDBlockToken):
        """Tell the caller whether the file is staged in or read directly.

        :param lfn: file name, checked with self.isRootFileName().
        :param prodDBlockToken: token handed to
            SiteInformation.getDirectInAccessMode().
        :return: (status, output). status is PilotErrors.ERR_DIRECTIOFILE for
            direct access, otherwise 0; output is a dictionary with the keys
            "errorLog", "report" and "transfer_mode".
        """
        client_report = {"clientState": None}
        output = {
            "errorLog": None,
            "report": client_report,
            "transfer_mode": None,
        }

        root_file = self.isRootFileName(lfn)
        site_info = SiteInformation()
        directIn, transfer_mode = site_info.getDirectInAccessMode(
            prodDBlockToken, root_file)

        if transfer_mode:
            output["transfer_mode"] = transfer_mode

        if directIn:
            # direct access: flag the file as found and leave the timing
            # fields unset
            client_report.update(clientState='FOUND_ROOT')
            client_report['relativeStart'] = None
            client_report['transferStart'] = None
            return PilotErrors.ERR_DIRECTIOFILE, output

        return 0, output
コード例 #6
0
ファイル: xrdcpSiteMover.py プロジェクト: PanDAWMS/pilot
    def fixStageOutPath(self, path):
        """Translate the stage-out PFN into a TURL via the copyprefix.

        :param path: PFN to translate.
        :return: (status, output). status is 0 unless the translated path does
            not start with "root:", in which case it is
            PilotErrors.ERR_STAGEINFAILED. output is a dictionary with the
            keys "errorLog", "report" (holding "clientState") and "path" (the
            translated path).
        """
        outputRet = {"errorLog": None, "report": {"clientState": None}}

        si = SiteInformation()
        tolog("Site mover will use get command: %s, %s" % (si.getCopyTool(stageIn=False)))

        # Which copyprefix is used depends on where the PFN points, e.g. with
        # copyprefix = srm://srm-eosatlas.cern.ch,srm://srm-atlas.cern.ch^root://eosatlas.cern.ch/,root://castoratlas-xrdssl/
        # PFN  = srm://srm-eosatlas.cern.ch/eos/atlas/atlasdatadisk/rucio/mc12_8TeV/8d/c0/EVNT.01212395._000004.pool.root.1
        # TURL = root://eosatlas.cern.ch//eos/atlas/atlasdatadisk/rucio/mc12_8TeV/8d/c0/EVNT.01212395._000004.pool.root.1
        ret_path = si.getCopyPrefixPath(path, stageIn=False)

        if ret_path.startswith("root:"):
            statusRet = 0
        else:
            errorLog = "Failed to use copyprefix to convert the current path to local path."
            tolog("!!WARNING!!1777!! %s" % (errorLog))
            outputRet["errorLog"] = errorLog
            outputRet["report"]["clientState"] = 'PSTAGE_FAIL'
            # NOTE(review): stage-in error code used in a stage-out helper -
            # confirm that this is intended
            statusRet = PilotErrors.ERR_STAGEINFAILED

        tolog("PFN=%s" % (path))
        tolog("TURL=%s" % (ret_path))
        outputRet['path'] = ret_path

        return statusRet, outputRet
コード例 #7
0
ファイル: GFAL2SiteMover.py プロジェクト: PalNilsson/pilot
    def fixStageInPath(self, path):
        """Fix the path

        Paths that start with "srm" and already contain a '?SFN=' part are only
        logged. For any other srm path the 'srm://<hostname>' prefix is
        replaced, either with the match returned by getSEMatchFromSEOpt() or,
        if there is no match, with the SE extracted from the first
        comma-separated entry of readpar('se'). Finally the path is passed
        through SiteInformation.getCopyPrefixPath(path, stageIn=True).

        :param path: file path (string).
        :return: updated path (string).
        """

        if path[:3] == "srm" and '?SFN=' in path:
            self.log("Found SFN part in file path: %s" % (path))
        elif path[:3] == "srm":
            try:
                hostname = path.split('/', 3)[2]
            except Exception as e:
                # the format arguments must be outside the string literal,
                # otherwise the unformatted template itself is logged
                self.log("!!WARNING!!2999!! Could not extract srm protocol for replacement, keeping path variable as it is: %s (%s)" %
                         (path, str(e)))
            else:
                # srm = 'srm://head01.aglt2.org'
                srm = 'srm://' + hostname

                # does seopt contain any matching srm's?
                sematch = self.getSEMatchFromSEOpt(srm)
                if sematch != "":
                    # the seopt match is substituted verbatim; there is no
                    # 'se' value in this branch to take a port number from
                    path = path.replace(srm, sematch)
                    self.log("Replaced %s with %s (from seopt) in path: %s" % (srm, sematch, path))
                else:
                    se = readpar('se').split(",")[0]
                    _dummytoken, se = self.extractSE(se)
                    tolog("Using SE: %s" % (se))

                    path = path.replace(srm, se)
                    self.log("Replaced %s with %s (from se) in path: %s" % (srm, se, path))

                    # add port number from se to getfile if necessary
                    # (only done here, where 'se' is actually defined)
                    path = self.addPortToPath(se, path)

        siteInformation = SiteInformation()
        path = siteInformation.getCopyPrefixPath(path, stageIn=True)

        return path
コード例 #8
0
ファイル: RunJobUtilities.py プロジェクト: mlassnig/pilot
def updateCopysetups(cmd3, transferType=None, useCT=None, directIn=None, useFileStager=None):
    """ Update the relevant copysetup field for remote I/O or file stager

    'copysetupin' is updated when readpar('copysetupin') is non-empty,
    otherwise 'copysetup' is updated. All keyword arguments are forwarded
    unchanged to SiteInformation.updateCopysetup().
    """

    si = SiteInformation()

    copysetupin_value = readpar('copysetupin')
    copysetup_value = readpar('copysetup')

    # pick the field first, then issue a single update call
    if copysetupin_value != "":
        field, value = 'copysetupin', copysetupin_value
    else:
        field, value = 'copysetup', copysetup_value

    si.updateCopysetup(cmd3, field, value, transferType=transferType, useCT=useCT, directIn=directIn, useFileStager=useFileStager)
コード例 #9
0
def VerifyQueuedata(verified):
    """verify the information received from the server throught the queuedata info
    """

    if not verified:
        print "Did not download queuedata from server"
    else:
        # is the site online?
        si = SiteInformation()
        status = si.readpar('status')
        if status.upper() == "OFFLINE":
            print "Site is %s, ignore" % (status.upper())
            # sys.exit(2)
        else:
            print "Site is", status
コード例 #10
0
def downloadscript(scripturl, sfile):
    """download the script with curl
    """

    verified = False

    si = SiteInformation()
    sslCertificate = si.getSSLCertificate()
    cmd = "curl  --connect-timeout 20 --max-time 120 --cacert %s %s -s -S -o %s" % (sslCertificate, scripturl, sfile)
    print cmd

    max_trials = 2
    for trial in range(1, max_trials+1):
        st, out = commands.getstatusoutput(cmd)
        print "%s: %s" % (st, out)
        if st != 0:
            print "Error retrieving script with curl (attempt %d/%d)" % (trial, max_trials)
        else:
            cmd = "chmod +x %s; /bin/ls -al %s" % ( sfile, sfile )
            print cmd
            stt, out = commands.getstatusoutput(cmd)
            print "%s: %s" % (stt, out)
            if stt == 0:
                if sfile.find('.tar.gz') > 0:
                    print "Untarring", sfile
                    cmd = "tar xzf %s" % sfile
                    print cmd
                    ec, out = commands.getstatusoutput(cmd)
                    print out
                    if ec != 0:
                        print "tar failed, curl did not return valid archive (attempt %d/%d)" % (trial, max_trials)
                    else:
                        cmd = "chmod +x *.py *.sh"
                        print cmd
                        ec, out = commands.getstatusoutput(cmd)
                        print ec, out
                        verified = True
                        break
                else:
                    verified = True
                    break
            else:
                print "chmod failed (attempt %d/%d)" % (trial, max_trials)
        if verified:
            break

    return verified
コード例 #11
0
def ReportToMonitor(tpid, host):
    """Report to the monitor that this pilot ID is live
    """

    if tpid != "?":
        print "Report pilot ID %s is live" % tpid
        si = SiteInformation()
        sslCertificate = si.getSSLCertificate()
        curl_url = 'http://panda.cern.ch:25980/server/pandamon/query?tpmes=setpilotlive&tpid=%s&host=%s' % (tpid,host)
        curl_cmd = 'curl --connect-timeout 20' \
                   + ' --max-time 120' \
                   + ' --cacert %s' % (sslCertificate) \
                   + ' -sS "%s"' % curl_url
        print curl_cmd
        curl_output = commands.getoutput(curl_cmd)
        print curl_output
    else:
        print "pilot ID is unknown"
コード例 #12
0
def getWorkdir():
    """obtaining the working directory
    """

    si = SiteInformation()
    getworkdir = si.readpar('wntmpdir')

    ## If workdir contains env var, translate it
    pat = re.compile('(.*)(\$[^\/]+)(.*)')
    mat = pat.match(getworkdir)
    if mat:
        envvar = mat.group(2)
        envvar = envvar.replace('$','')
        if os.environ.has_key(envvar):
            envval = os.environ[envvar]
            getworkdir = "%s%s%s" % ( mat.group(1), envval, mat.group(3) )
            print "Translated wntmpdir with env var to", getworkdir
        else:
            print "WARNING: wntmpdir contains env var %s that is undefined" % envvar
 
    return getworkdir
コード例 #13
0
ファイル: RunJobUtilities.py プロジェクト: vokac/pilot
def updateCopysetups(cmd3,
                     transferType=None,
                     useCT=None,
                     directIn=None,
                     useFileStager=None):
    """ Update the relevant copysetup fields for remote I/O or file stager """

    si = SiteInformation()

    _copysetupin = readpar('copysetupin')
    _copysetup = readpar('copysetup')

    if _copysetupin != "":
        si.updateCopysetup(cmd3,
                           'copysetupin',
                           _copysetupin,
                           transferType=transferType,
                           useCT=useCT,
                           directIn=directIn,
                           useFileStager=useFileStager)
    else:
        si.updateCopysetup(cmd3,
                           'copysetup',
                           _copysetup,
                           transferType=transferType,
                           useCT=useCT,
                           directIn=directIn,
                           useFileStager=useFileStager)
コード例 #14
0
def getrunpars(queuedata, workdir, puser, countrygroup, allowothercountry, libcode):

    jobstat = 0

    appdir = ""
    reldir = ""
    sitesetup = ""
    releases = ""
    datadir = ""
    dq2url = ""
    
    queue = os.environ['QueueName']
    print "atlasProdPilot running on queue", queue

    if os.environ.has_key('PANDA_URL_SSL'):
        global baseURLSSL
        baseURLSSL = os.environ['PANDA_URL_SSL']
        pat = re.compile('^(.*):([0-9]+)/.*$')
        mat = pat.match(baseURLSSL)
        if mat:
            pandaURL = mat.group(1)
            pandaPort = mat.group(2)
        else:
            print "!!FAILED!!6999!!Bad Panda URL %s" % baseURLSSL
            jobstat = 1
    else:
        print "!!FAILED!!6999!!PANDA_URL_SSL undefined"
        jobstat = 1

    # reading setup from queuedata
    try:
        # checking if queuedata is not None
        queuedata
    except:
        queuedata = ""
    else:
        si = SiteInformation()
        appdir  = si.getpar('appdir',queuedata)
        datadir = si.getpar('datadir',queuedata)
        dq2url  = si.getpar('dq2url',queuedata)

        par = si.getpar('gatekeeper',queuedata)
        if par != "":
            print "Setting ATLAS_CONDDB to", par
            os.environ["ATLAS_CONDDB"] = par

    print "Setting RUCIO_ACCOUNT to pilot"
    os.environ["RUCIO_ACCOUNT"] = 'pilot'

    if datadir == "":
        if os.environ.has_key("SCRATCH_DIRECTORY"):
            datadir = os.environ["SCRATCH_DIRECTORY"]
        elif os.environ.has_key("OSG_WN_TMP"):
            datadir = os.environ["OSG_WN_TMP"]
        else:
            print "!!WARNING!!2500!!Cannot locate scratch area"
    
    if datadir == "":
        print "datadir not defined"
    else:
        print "datadir:", datadir
    
    if dq2url == "":
        print "dq2url not defined"
    else:
        print "dq2url:", dq2url

    try:
        release = os.environ["swRelease"]
        release = release.replace('Atlas-','')
        print "Release:", release
    except:
        print "swRelease is not defined"
        release = ""

    runpars = [ '-s', site, 
                '-h', queue, 
                '-d', workdir, 
                '-q', dq2url, 
                '-f', 'true', 
                '-w', pandaURL, 
                '-p', pandaPort, 
                '-l', 'true'
              ]

    print "Using source: %s" % (libcode)
    if "pilotcode-dev.tar.gz" in libcode:
        runpars.append('-C')
        runpars.append('0')
        print "Will turn off multi-jobs for dev pilot"

    if appdir != "":
        # Set environment variables the pilot wants
        os.environ["SITEROOT"]="%s/%s" % ( appdir, release )
        runpars.append('-a')
        runpars.append(appdir)
        print "appdir:", appdir
    else:
        print "appdir not defined"

    if puser != "":
        runpars.append('-u')
        runpars.append(puser)

    if countrygroup != "":
        runpars.append('-o')
        runpars.append(countrygroup)

    if allowothercountry != "":
        runpars.append('-A')
        runpars.append(allowothercountry)

    return runpars, jobstat
コード例 #15
0
            _dir = '/etc/grid-security/certificates'
            if os.path.exists(_dir):
                sslCertificatesDirectory = _dir
            else:
                tolog("!!WARNING!!2999!! $X509_CERT_DIR is not set and default location %s does not exist" % (_dir))

        return sslCertificatesDirectory

    def getProperPaths(self):
        """ Return proper paths for the storage element

        Placeholder that returns an empty string; meant to be implemented in
        a sub-class.
        """

        return ""

    def getTier1Queue(self, cloud):
        """ Download the queuedata for the Tier-1 in the corresponding cloud and get the queue name

        Placeholder that returns None; meant to be implemented in a sub-class
        (see the methods in ATLASSiteInformation). The method is used during
        stage-out to an alternative [Tier-1] site when the primary stage-out
        on a Tier-2 fails.

        :param cloud: cloud name (not used by this placeholder).
        """

        return None

if __name__ == "__main__":
    from SiteInformation import SiteInformation
    import os
    os.environ['PilotHomeDir'] = os.getcwd()
    s1 = SiteInformation()
    print "copytool=",s1.readpar('copytool')
コード例 #16
0
class curlSiteMover(SiteMover.SiteMover):
    """ SiteMover for curl """

    copyCommand = "curl"
    checksum_command = "adler32"
    has_mkdir = False
    has_df = False
    has_getsize = False
    has_md5sum = True
    has_chmod = False
    timeout = 3600
    """ get proxy """

    si = SiteInformation()
    sslCert = si.getSSLCertificate()
    sslKey = sslCert
    sslCertDir = si.getSSLCertificatesDirectory()

    def __init__(self, setup_path, *args, **kwrds):
        """ Store the setup path; additional arguments are accepted but not used here """
        self._setup = setup_path

    def get_timeout(self):
        """ Return the value of the timeout attribute """
        return self.timeout

    def check_space(self, ub):
        """ For when space availability is not verifiable

        Always returns the fixed value 999999; the ub argument is ignored.
        """
        return 999999

    def core_get_data(self, envsetup, token, source_surl, dest_path,
                      experiment):
        """ stage-in core function, can be overridden (see stormSiteMover) """

        error = PilotErrors()

        # determine which timeout option to use
        timeout_option = "--connect-timeout 300 --max-time %d" % (self.timeout)

        sslCert = self.sslCert
        sslKey = self.sslKey
        sslCertDir = self.sslCertDir

        # used curl options:
        # --cert: <cert[:passwd]> Client certificate file and password (SSL)
        # --capath: <directory> CA directory (made using c_rehash) to verify
        # --location: Follow Location: hints (H)
        # --output: <file> Write output to <file> instead of stdout
        # --cilent: Makes Curl mute
        # --show-error: When used with -s it makes curl show error message if it fails
        # Removed for SL6: --ciphers <list of ciphers> (SSL)  Specifies  which  ciphers  to use in the connection.
        """ define curl command string """
        _cmd_str = 'lcg-gt %s https' % (source_surl)
        try:
            s, o = commands.getstatusoutput(_cmd_str)
            tolog("Executing command: %s" % (_cmd_str))
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" %
                  (str(e), s, o))
            o = str(e)
        if s == 0:
            tolog("lcg-gt supported, get http path")
            source_surl = o.strip().split()
            source_surl = source_surl[0]
            _cmd_str = '%s curl --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (
                envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey,
                source_surl, dest_path)
#            _cmd_str = '%s curl --ciphers ALL:NULL --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey, source_surl, dest_path)
        else:
            tolog(
                "lcg-gt not supported, get http path by replacing source_surl")
            _cmd_str = '%s curl --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (
                envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey,
                source_surl, dest_path)
            #            _cmd_str = '%s curl --ciphers ALL:NULL --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey, source_surl, dest_path)
            _cmd_str = _cmd_str.replace("srm://", "https://")
        # add the full stage-out command to the job setup script
        #_cmd_str = _cmd_str.replace("file://", "-o ")

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        to_script = _cmd_str
        to_script = to_script.lstrip(' ')  # remove any initial spaces
        if to_script.startswith('/'):
            to_script = 'source ' + to_script
        thisExperiment.updateJobSetupScript(os.path.dirname(dest_path),
                                            to_script=to_script)

        tolog("Executing command: %s" % (_cmd_str))
        s = -1
        o = '(not defined)'
        t0 = os.times()
        try:
            s, o = commands.getstatusoutput(_cmd_str)
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" %
                  (str(e), s, o))
            o = str(e)
コード例 #17
0
        surl = surl.replace("s3+rucio", "s3")
        if surl.startswith("root:"):
            sitemover = xrootdObjectstoreSiteMover(self.getSetup())
            return sitemover. put_data(source, destination, fsize, fchecksum, **pdict)
        if surl.startswith("s3:"):
            sitemover = S3ObjectstoreSiteMover(self.getSetup(), self._useTimerCommand)
            return sitemover. put_data(source, surl, fsize, fchecksum, **pdict)
        return -1, "No objectstore sitemover found for this scheme(%s)" % destination, destination, fsize, fchecksum, config_sm.ARCH_DEFAULT


if __name__ == '__main__':
    # Manual test driver: sets up the environment, creates a site mover and
    # defines sample file metadata for the (commented-out) calls below.

    os.environ['PilotHomeDir'] = os.getcwd()
    from SiteInformation import SiteInformation
    s1 = SiteInformation()
    #s1.getObjectstoresField("os_access_key", "eventservice", queuename='BNL_EC2W2_MCORE')

    f = objectstoreSiteMover()

    # sample file description
    lfn = "AOD.310713._000004.pool.root.1"
    dsname = "data11_7TeV.00177986.physics_Egamma.merge.AOD.r2276_p516_p523_tid310713_00"
    gpfn = "nonsens_gpfn"
    fsize = "4261010441"
    fchecksum = "9145af38"

    path = os.getcwd()
    report = {}

    #print f.getGlobalFilePaths(dsname)
    #print f.findGlobalFilePath(lfn, dsname)
    #print f.getLocalROOTSetup()
コード例 #18
0
ファイル: RunJobArgo.py プロジェクト: complynx/pilot
    def executePayload(self, thisExperiment, job):
        # Send the ARGO job over the message queue (MQ) and poll the job
        # status queue until a terminal state is seen. Any exception is
        # logged and reported with self.failJob().
        # NOTE(review): no return statement is visible in this block and
        # thisExperiment/t0 are not used here - the method may continue
        # beyond what is shown.
        
        t0 = os.times() 
        res_tuple = None
        
        # loop over all run commands (only >1 for multi-trfs)
        getstatusoutput_was_interrupted = False
        job_status = None
        tolog("About to launch ARGO job")
        # Poll MQ for Job Status
        try:
            # Initiate MQ interface and send job
            self.argo_job.job_status_routing_key = '%s_job_status' % job.jobId #'status_' + jobID
            si = SiteInformation()
            mi = MessageInterface()
            mi.host = 'atlasgridftp02.hep.anl.gov'
            mi.port = 5671
            # ssl_cert is assigned up to three times, the last assignment wins:
            # X509_USER_CERT if set, otherwise rabbitmq-cert.pem in the
            # directory of the certificate returned by getSSLCertificate()
            mi.ssl_cert = si.getSSLCertificate() #'/grid/atlas/hpc/pilot_certs/xrootdsrv-cert.pem'
            proxy_cert_path = si.getSSLCertificate()
            mi.ssl_cert = os.path.dirname(proxy_cert_path) + "/rabbitmq-cert.pem"
            if 'X509_USER_CERT' in os.environ.keys():
                mi.ssl_cert = os.environ['X509_USER_CERT'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-cert.pem'
            
            # same pattern for the key: X509_USER_KEY if set, otherwise
            # rabbitmq-key.pem in the same directory
            mi.ssl_key  = mi.ssl_cert #'/grid/atlas/hpc/pilot_certs/xrootdsrv-key.pem'
            mi.ssl_key = os.path.dirname(proxy_cert_path) + "/rabbitmq-key.pem"
            if 'X509_USER_KEY' in os.environ.keys():
                mi.ssl_key  = os.environ['X509_USER_KEY'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-key.pem'
            
            # CA certificates: hard-coded default, replaced by
            # rabbitmq-cacerts.pem next to the certificate if that file exists
            #mi.ssl_ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
            mi.ssl_ca_certs = '/grid/atlas/hpc/pilot_certs/cacerts.pem' 
            #if 'X509_CA_CERTS' in os.environ.keys():
            #    mi.ssl_ca_certs = os.environ['X509_CA_CERTS'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/cacerts.pem'
            #tolog("CA certs: %s" % (mi.ssl_ca_certs))
            ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
            if os.path.isfile(ca_certs): 
                mi.ssl_ca_certs = ca_certs
 
            mi.exchange_name = 'argo_users'

            #Create queue to get messages about ARGO Job status from MQ
            tolog('Opening connection with MQ')
            mi.open_blocking_connection()
            tolog('Create queue [%s]  to retrieve messages with job status' % self.argo_job.job_status_routing_key)

            mi.create_queue(self.argo_job.job_status_routing_key, self.argo_job.job_status_routing_key)

            # submit ARGO job to MQ
            
            #tolog('Opening connection with MQ')
            #mi.open_blocking_connection()
            routing_key = 'argo_job'
            if self.dev:
                routing_key = 'argo_job_dev'
            tolog('Sending msg with job to ARGO')
            mi.send_msg(self.argo_job.serialize(), routing_key)
            tolog(' done sending ')
            
            # Waiting till job done or failed    
            # NOTE(review): the loop sleeps both at the start and at the end
            # of an iteration, i.e. 10 seconds pass between two polls
            ARGO_err_msg = ''
            while True:
                time.sleep(5)
                message = mi.receive_msg(self.argo_job.job_status_routing_key, True)
                if message[2]:
                    tolog ("Got message from queue [%s]: method [%s], properties [%s], body [ %s ]" % (self.argo_job.job_status_routing_key, message[0], message[1], message[2]))
                    job_status = ArgoJobStatus.get_from_message(message[2])
                    job.hpcStatus = job_status.state
                    rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort())

                    tolog("Extracted state: %s" % job_status.state)
                    if job_status.state == job_status.HISTORY:
                        res_tuple = (0, "Done")
                        break
                    elif job_status.is_failed():
                        # NOTE(review): this branch does not leave the loop, so
                        # polling continues after a failure was seen; if
                        # is_failed() also covers the FAILED state, the branch
                        # below is never reached - confirm
                        res_tuple = (1, "Failed")
                        ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                    elif job_status.state == job_status.FAILED:
                        res_tuple = (1, "Failed")
                        ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                        # NOTE(review): uses the name runJob while the handler
                        # below uses self.failJob - presumably a module-level
                        # object; confirm
                        runJob.failJob(1, 0, job, ins=job.inFiles, pilotErrorDiag=ARGO_err_msg)
                        break
                time.sleep(5)
                  
            # only reached when the loop ends normally; on an exception the
            # connection is not closed by this block
            mi.close()
            tolog(' closing connection to MQ')
                
            tolog("Job State: %s" % (job_status.state))
            #job.timeExe = int(fork_job.finished - fork_job.started)
                
            ####################################################
    
        except Exception, e:
            tolog("!!FAILED!!3000!! Failed to run command %s" % str(e))
            getstatusoutput_was_interrupted = True
            res_tuple = (1, "Failed")
            self.failJob(0, self.__error.ERR_GENERALERROR, job, pilotErrorDiag=str(e))
コード例 #19
0
ファイル: RunJobArgo.py プロジェクト: jtchilders/pilot
    def executePayload(self, thisExperiment, job):
        # Send the ARGO job over the message queue (MQ) and poll the job
        # status queue until a terminal state is seen. Any exception is
        # logged and reported with self.failJob().
        # NOTE(review): no return statement is visible in this block and
        # thisExperiment/t0 are not used here - the method may continue
        # beyond what is shown.

        t0 = os.times()
        res_tuple = None

        # loop over all run commands (only >1 for multi-trfs)
        getstatusoutput_was_interrupted = False
        job_status = None
        tolog("About to launch ARGO job")
        # Poll MQ for Job Status
        try:
            # Initiate MQ interface and send job
            self.argo_job.job_status_routing_key = '%s_job_status' % job.jobId  #'status_' + jobID
            si = SiteInformation()
            mi = MessageInterface()
            mi.host = 'atlasgridftp02.hep.anl.gov'
            mi.port = 5671
            # ssl_cert is assigned up to three times, the last assignment wins:
            # X509_USER_CERT if set, otherwise rabbitmq-cert.pem in the
            # directory of the certificate returned by getSSLCertificate()
            mi.ssl_cert = si.getSSLCertificate(
            )  #'/grid/atlas/hpc/pilot_certs/xrootdsrv-cert.pem'
            proxy_cert_path = si.getSSLCertificate()
            mi.ssl_cert = os.path.dirname(
                proxy_cert_path) + "/rabbitmq-cert.pem"
            if 'X509_USER_CERT' in os.environ.keys():
                mi.ssl_cert = os.environ[
                    'X509_USER_CERT']  #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-cert.pem'

            # same pattern for the key: X509_USER_KEY if set, otherwise
            # rabbitmq-key.pem in the same directory
            mi.ssl_key = mi.ssl_cert  #'/grid/atlas/hpc/pilot_certs/xrootdsrv-key.pem'
            mi.ssl_key = os.path.dirname(proxy_cert_path) + "/rabbitmq-key.pem"
            if 'X509_USER_KEY' in os.environ.keys():
                mi.ssl_key = os.environ[
                    'X509_USER_KEY']  #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-key.pem'

            # CA certificates: hard-coded default, replaced by
            # rabbitmq-cacerts.pem next to the certificate if that file exists
            #mi.ssl_ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
            mi.ssl_ca_certs = '/grid/atlas/hpc/pilot_certs/cacerts.pem'
            #if 'X509_CA_CERTS' in os.environ.keys():
            #    mi.ssl_ca_certs = os.environ['X509_CA_CERTS'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/cacerts.pem'
            #tolog("CA certs: %s" % (mi.ssl_ca_certs))
            ca_certs = os.path.dirname(
                proxy_cert_path) + "/rabbitmq-cacerts.pem"
            if os.path.isfile(ca_certs):
                mi.ssl_ca_certs = ca_certs

            mi.exchange_name = 'argo_users'

            #Create queue to get messages about ARGO Job status from MQ
            tolog('Opening connection with MQ')
            mi.open_blocking_connection()
            tolog('Create queue [%s]  to retrieve messages with job status' %
                  self.argo_job.job_status_routing_key)

            mi.create_queue(self.argo_job.job_status_routing_key,
                            self.argo_job.job_status_routing_key)

            # submit ARGO job to MQ

            #tolog('Opening connection with MQ')
            #mi.open_blocking_connection()
            routing_key = 'argo_job'
            if self.dev:
                routing_key = 'argo_job_dev'
            tolog('Sending msg with job to ARGO')
            mi.send_msg(self.argo_job.serialize(), routing_key)
            tolog(' done sending ')

            # Waiting till job done or failed
            # NOTE(review): the loop sleeps both at the start and at the end
            # of an iteration, i.e. 10 seconds pass between two polls
            ARGO_err_msg = ''
            while True:
                time.sleep(5)
                message = mi.receive_msg(self.argo_job.job_status_routing_key,
                                         True)
                if message[2]:
                    tolog(
                        "Got message from queue [%s]: method [%s], properties [%s], body [ %s ]"
                        % (self.argo_job.job_status_routing_key, message[0],
                           message[1], message[2]))
                    job_status = ArgoJobStatus.get_from_message(message[2])
                    job.hpcStatus = job_status.state
                    rt = RunJobUtilities.updatePilotServer(
                        job, self.getPilotServer(), self.getPilotPort())

                    tolog("Extracted state: %s" % job_status.state)
                    if job_status.state == job_status.HISTORY:
                        res_tuple = (0, "Done")
                        break
                    elif job_status.is_failed():
                        # NOTE(review): this branch does not leave the loop, so
                        # polling continues after a failure was seen; if
                        # is_failed() also covers the FAILED state, the branch
                        # below is never reached - confirm
                        res_tuple = (1, "Failed")
                        ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                    elif job_status.state == job_status.FAILED:
                        res_tuple = (1, "Failed")
                        ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                        # NOTE(review): uses the name runJob while the handler
                        # below uses self.failJob - presumably a module-level
                        # object; confirm
                        runJob.failJob(1,
                                       0,
                                       job,
                                       ins=job.inFiles,
                                       pilotErrorDiag=ARGO_err_msg)
                        break
                time.sleep(5)

            # only reached when the loop ends normally; on an exception the
            # connection is not closed by this block
            mi.close()
            tolog(' closing connection to MQ')

            tolog("Job State: %s" % (job_status.state))
            #job.timeExe = int(fork_job.finished - fork_job.started)

            ####################################################

        except Exception, e:
            tolog("!!FAILED!!3000!! Failed to run command %s" % str(e))
            getstatusoutput_was_interrupted = True
            res_tuple = (1, "Failed")
            self.failJob(0,
                         self.__error.ERR_GENERALERROR,
                         job,
                         pilotErrorDiag=str(e))