Example #1
    def getTier1Queue2(self, cloud):
        """ Download the queuedata for the Tier-1 in the corresponding cloud and get the queue name """

        queuename = ""

        path = self.getTier1InfoFilename()
        ec = self.downloadTier1Info()
        if ec == 0:
            # Process the downloaded T-1 info
            f = open(path, 'r')
            if getExtension() == "json":
                from json import loads
                data = loads(f.read())
            else:
                from pickle import load
                data = load(f)
            f.close()

            # Extract the relevant queue info for the given cloud
            T1_info = [x for x in data if x['cloud']==cloud]

            # finally get the queue name
            if T1_info != []:
                info = T1_info[0]
                if info.has_key('PanDAQueue'):
                    queuename = info['PanDAQueue']
                else:
                    tolog("!!WARNING!!1222!! Returned Tier-1 info object does not have key PanDAQueue: %s" % str(info))
            else:
                tolog("!!WARNING!!1223!! Found no Tier-1 info for cloud %s" % (cloud))

        return queuename
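For reference, the load-by-extension pattern used above can be written as a small standalone helper. This is only a sketch in current Python, not pilot code; the load_serialized name and explicit extension argument are assumptions made here.

import json
import pickle

def load_serialized(path, extension):
    """Load a JSON or pickle file depending on the extension (illustrative sketch)."""
    if extension == "json":
        # JSON info files are plain text
        with open(path, "r") as f:
            return json.load(f)
    # pickle files must be opened in binary mode
    with open(path, "rb") as f:
        return pickle.load(f)

# Hypothetical usage mirroring Example #1:
# data = load_serialized(path, getExtension())
# T1_info = [x for x in data if x["cloud"] == cloud]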
Example #2
    def rename(self, site, job):
        """
        Rename the job state file. Should only be called for
        holding jobs that have passed the maximum number of recovery attempts.
        """
        status = True

        # get the file extension
        extension = getExtension()

        fileNameOld = "%s/jobState-%s.%s" % (site.workdir, job.jobId, extension)
        fileNameNew = "%s/jobState-%s.%s.MAXEDOUT" % (site.workdir, job.jobId, extension)
        if os.path.isfile(fileNameOld):
            # rename the job state file
            try:
                os.system("mv %s %s" % (fileNameOld, fileNameNew))
            except OSError:
                tolog("JOBSTATE FAILURE: Failed to rename job state file: %s" % (fileNameOld))
                status = False
            else:
                tolog("Job state file renamed to: %s" % (fileNameNew))
        else:
            tolog("JOBSTATE FAILURE: Job state file does not exist: %s" % (fileNameOld))
            status = False
            
        return status
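Note that os.system returns an exit status rather than raising OSError when the mv command fails, so the except branch above is never reached for a failed move. A minimal sketch of the same rename using os.rename instead (a swapped-in technique, not the pilot's code) could look like this:

import os

def rename_to_maxedout(old_path):
    """Rename a job state file to <name>.MAXEDOUT with os.rename (illustrative sketch).

    Unlike os.system("mv ..."), os.rename raises OSError on failure, so the
    error branch below is actually reachable.
    """
    new_path = old_path + ".MAXEDOUT"
    try:
        os.rename(old_path, new_path)
    except OSError as e:
        print("Failed to rename %s: %s" % (old_path, e))
        return False
    return True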
Example #3
    def getTier1InfoFilename(self):
        """ Get the Tier-1 info file name """

        filename = "Tier-1_info.%s" % (getExtension())
        path = "%s/%s" % (os.environ['PilotHomeDir'], filename)

        return path
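The same path construction can be expressed with os.path.join; this sketch assumes PilotHomeDir may be unset and falls back to the current directory, which is an assumption and not pilot behaviour.

import os

def tier1_info_path(extension="json"):
    """Build <PilotHomeDir>/Tier-1_info.<extension> (illustrative sketch of Example #3)."""
    # assumption: fall back to the current working directory if PilotHomeDir is unset
    home = os.environ.get("PilotHomeDir", os.getcwd())
    return os.path.join(home, "Tier-1_info.%s" % extension)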
Example #4
def ReadQueueParams():

    queuedata = ""
    verified = False

    si = SiteInformation()
    qdfname = si.getQueuedataFileName(check=False)
#    qdfname = "%s/queuedata.dat" % (os.environ['PilotHomeDir'])
#    qdfname = "%s/queuedata.json" % (os.environ['PilotHomeDir'])
    if os.path.exists(qdfname):
        print "queuedata already downloaded"
        verified, queuedata = readqueuedata(qdfname, first_trial=True)
    else:
        print "Downloading queuedata in atlasProdPilot"

        # download queuedata and verify it
        extension = utils.getExtension(alternative='dat')
        if extension == "json":
            _ext = extension
        else:
            _ext = "pilot"
        curl_url = 'http://pandaserver.cern.ch:25085/cache/schedconfig/%s.pilot.%s' % (qname, _ext)
        si = SiteInformation()
        sslCertificate = si.getSSLCertificate()
        cmd = 'curl --connect-timeout 20 --max-time 120 --cacert %s -sS "%s" > %s' % (sslCertificate, curl_url, qdfname)
        print "Executing command: %s" % (cmd)
        try:
            ret, rs = commands.getstatusoutput(cmd)
        except Exception, e:
            print "!!WARNING!!1999!! Failed with curl command: %s" % str(e)
        else:
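The example above breaks off after the curl call. As a self-contained sketch of the same download step, the following uses subprocess in place of the Python 2-only commands module; the function name, arguments, and the .json extension are assumptions, not the pilot's implementation.

import subprocess

def download_schedconfig(queue_name, out_path, ssl_certificate):
    """Fetch <queue>.pilot.json from the panda server cache with curl (illustrative sketch)."""
    url = ("http://pandaserver.cern.ch:25085/cache/schedconfig/"
           "%s.pilot.json" % queue_name)
    cmd = ["curl", "--connect-timeout", "20", "--max-time", "120",
           "--cacert", ssl_certificate, "-sS", "-o", out_path, url]
    # returncode 0 means curl completed; the file still needs to be verified by the caller
    return subprocess.run(cmd).returncode == 0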
Example #5
    def getQueuedataFileName(self, useExtension=None, check=True, alt=False):
        """ Define the queuedata filename """

        # use a forced extension if necessary
        if useExtension:
            extension = useExtension
        else:
            extension = getExtension(alternative='dat')

        # prepend alt. for alternative stage-out site queuedata
        if alt:
            extension = "alt." + extension

        path = "%s/queuedata.%s" % (os.environ['PilotHomeDir'], extension)

        # remove the json extension if the file cannot be found (complication due to wrapper)
        if not os.path.exists(path) and check:
            if extension == 'json':
                _path = path.replace('.json', '.dat')
                if os.path.exists(_path):
                    tolog("Updating queuedata file name to: %s" % (_path))
                    path = _path
                else:
                    tolog("!!WARNING!! Queuedata paths do not exist: %s, %s" % (path, _path))
            if extension == 'dat':
                _path = path.replace('.dat', '.json')
                if os.path.exists(_path):
                    tolog("Updating queuedata file name to: %s" % (_path))
                    path = _path
                else:
                    tolog("!!WARNING!! Queuedata paths do not exist: %s, %s" % (path, _path))
        return path
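The json/dat fallback logic above can be summarized as a small helper; the resolve_queuedata_path name and the prefer argument are assumptions made for this sketch.

import os

def resolve_queuedata_path(pilot_home, prefer="json"):
    """Pick queuedata.json or queuedata.dat, whichever exists (illustrative sketch)."""
    other = "dat" if prefer == "json" else "json"
    preferred = os.path.join(pilot_home, "queuedata.%s" % prefer)
    fallback = os.path.join(pilot_home, "queuedata.%s" % other)
    if os.path.exists(preferred):
        return preferred
    if os.path.exists(fallback):
        return fallback
    # neither file exists yet; return the preferred name so the caller can download it
    return preferred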
Example #6
def getTrfExitInfo(exitCode, workdir):
    """ Get the trf exit code and info from job report if possible """

    exitAcronym = ""
    exitMsg = ""

    # does the job report exist?
    extension = getExtension(alternative='pickle')
    if extension.lower() == "json":
        filename = os.path.join(workdir, "jobReport.%s" % (extension))
    else:
        filename = os.path.join(workdir, "jobReportExtract.%s" % (extension))
    if os.path.exists(filename):
        tolog("Found job report: %s" % (filename))

        # search for the exit code
        try:
            f = open(filename, "r")
        except Exception, e:
            tolog("!!WARNING!!1112!! Failed to open job report: %s" % (e))
        else:
            if extension.lower() == "json":
                from json import load
            else:
                from pickle import load
            data = load(f)

            # extract the exit code and info
            _exitCode = extractDictionaryObject("exitCode", data)
            if _exitCode:
                if _exitCode == 0 and exitCode != 0:
                    tolog("!!WARNING!!1111!! Detected inconsistency in %s: exitcode listed as 0 but original trf exit code was %d (using original error code)" %\
                          (filename, exitCode))
                else:
                    exitCode = _exitCode
            _exitAcronym = extractDictionaryObject("exitAcronym", data)
            if _exitAcronym:
                exitAcronym = _exitAcronym
            _exitMsg = extractDictionaryObject("exitMsg", data)
            if _exitMsg:
                exitMsg = _exitMsg

            f.close()

            tolog("Trf exited with:")
            tolog("...exitCode=%d" % (exitCode))
            tolog("...exitAcronym=%s" % (exitAcronym))
            tolog("...exitMsg=%s" % (exitMsg))
Example #7
    def getQueuedata(self, queuename, forceDownload=False, alt=False):
        """ Download the queuedata if not already downloaded """

        # read the queue parameters and create the queuedata.dat file (if the queuename variable is set)
        # pilot home dir is not the workdir. It's the sandbox directory for the pilot.
        # queuedata structure: (with example values)
        # appdir=/ifs0/osg/app/atlas_app/atlas_rel
        # dq2url=http://gk03.swt2.uta.edu:8000/dq2/
        # copytool=gridftp
        # copysetup=
        # ddm=UTA_SWT2
        # se=http://gk03.swt2.uta.edu:8000/dq2/
        # sepath=
        # envsetup=
        # region=US
        # copyprefix=
        # lfcpath=
        # lfchost=
        # sein=
        # wntmpdir=/scratch

        # curl --connect-timeout 20 --max-time 120 -sS "http://pandaserver.cern.ch:25085/cache/schedconfig/BNL_CVMFS_1-condor.pilot.pilot"

        if not os.environ.has_key('PilotHomeDir'):
            os.environ['PilotHomeDir'] = commands.getoutput('pwd')
        hasQueuedata = False

        # try the config servers one by one in case one of them is not responding

        # in case the wrapper has already downloaded the queuedata, it might have a .dat extension
        # otherwise, give it a .json extension if possible
        filename_dat = self.getQueuedataFileName(useExtension='dat', check=False, alt=alt)
        if os.path.exists(filename_dat):
            filename = filename_dat
        else:
            filename = self.getQueuedataFileName(check=False, alt=alt)

        if os.path.exists(filename) and not forceDownload:
            tolog("Queuedata has already been downloaded by pilot wrapper script (will confirm validity)")
            hasQueuedata = self.verifyQueuedata(queuename, filename, 1, 1, "(see batch log for url)")
            if hasQueuedata:
                tolog("Queuedata was successfully downloaded by pilot wrapper script")
            else:
                tolog("Queuedata was not downloaded successfully by pilot wrapper script, will try again")

        if not hasQueuedata:
            # loop over pandaserver round robin _N times until queuedata has been verified, or fail
            ret = -1
            url = 'http://pandaserver.cern.ch'
            if os.environ.has_key('X509_USER_PROXY'):
                sslCert = os.environ['X509_USER_PROXY']
            else:
                sslCert  = '/tmp/x509up_u%s' % str(os.getuid())
            cmd = 'curl --connect-timeout 20 --max-time 120 --cacert %s -sS "%s:25085/cache/schedconfig/%s.pilot.%s" > %s' % \
                  (sslCert, url, queuename, getExtension(alternative='pilot'), filename)
            _N = 3
            for _i in range(_N):
                tolog("Executing command: %s" % (cmd))
                try:
                    # output will be empty since we pipe into a file
                    ret, output = commands.getstatusoutput(cmd)
                except Exception, e:
                    tolog("!!WARNING!!1999!! Failed with curl command: %s" % str(e))
                    return -1, False
                else:
                    if ret == 0:
                        # read back the queuedata to verify its validity
                        hasQueuedata = self.verifyQueuedata(queuename, filename, _i, _N, url)
                        if hasQueuedata:
                            break
                    else:
                        tolog("!!WARNING!!1999!! curl command exited with code %d" % (ret))