Example No. 1
    def removeRemoteFile(self, full_surl):
        cmd = '%s gfal-rm --verbose  -t %d  %s' % (self._setup, self.timeout, full_surl)
        tolog("Executing command: %s" % (cmd))
        try:
            ec, rs = commands.getstatusoutput(cmd)
        except Exception, e:
            tolog("Warning: Exception caught in removeRemoteFile: %s" % (e))
Example No. 2
    def updateEventRanges(self, event_ranges):
        """ Update an event range on the Event Server """
        pUtil.tolog("Updating event ranges..")

        message = ""
        #url = "https://aipanda007.cern.ch:25443/server/panda"
        url = "https://pandaserver.cern.ch:25443/server/panda"
        # eventRanges = [{'eventRangeID': '4001396-1800223966-4426028-1-2', 'eventStatus':'running'}, {'eventRangeID': '4001396-1800223966-4426028-2-2','eventStatus':'running'}]

        node={}
        node['eventRanges']=json.dumps(event_ranges)

        # open connection
        ret = pUtil.httpConnect(node, url, path=self.__updateEventRangesDir, mode="UPDATEEVENTRANGES")
        # response = json.loads(ret[1])

        status = ret[0]
        if ret[0]: # non-zero return code
            message = "Failed to update event range - error code = %d, error: " % (ret[0], ret[1])
        else:
            response = json.loads(json.dumps(ret[1]))
            status = int(response['StatusCode'])
            message = json.dumps(response['Returns'])

        return status, message
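
For reference, a minimal standalone sketch (not part of the pilot code) of the payload this method builds before calling pUtil.httpConnect; the event range IDs are taken from the commented example above and the statuses are invented:

import json

# hypothetical event ranges, shaped like the commented example in updateEventRanges()
event_ranges = [
    {'eventRangeID': '4001396-1800223966-4426028-1-2', 'eventStatus': 'finished'},
    {'eventRangeID': '4001396-1800223966-4426028-2-2', 'eventStatus': 'running'},
]

node = {}
node['eventRanges'] = json.dumps(event_ranges)
print node['eventRanges']  # this JSON string is what would be sent to the PanDA server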
Example No. 3
    def finish(self):
        try:
            pUtil.tolog("Tell Event Stager to finish after finishing staging out all events")
            self.__canFinish = True
            self.renewEventStagerStatus()
        except:
            pUtil.tolog("Failed to monitor Event Stager: %s" % traceback.format_exc())
Example No. 4
    def core_get_data(self, envsetup, token, source_surl, dest_path, experiment):
        """ stage-in core function, can be overridden (see stormSiteMover) """

        error = PilotErrors()

        # determine which timeout option to use
        timeout_option = "--connect-timeout 300 --max-time %d" % (self.timeout)

        sslCert = self.sslCert
        sslKey = self.sslKey
        sslCertDir = self.sslCertDir

        # used curl options:
        # --cert: <cert[:passwd]> Client certificate file and password (SSL)
        # --capath: <directory> CA directory (made using c_rehash) to verify
        # --location: Follow Location: hints (H)
        # --output: <file> Write output to <file> instead of stdout
        # --silent: Makes curl mute
        # --show-error: When used with -s it makes curl show error message if it fails
        # Removed for SL6: --ciphers <list of ciphers> (SSL)  Specifies  which  ciphers  to use in the connection.

        """ define curl command string """
        _cmd_str = 'lcg-gt %s https' % (source_surl)
        tolog("Executing command: %s" % (_cmd_str))
        s = -1
        o = '(not defined)'
        try:
            s, o = commands.getstatusoutput(_cmd_str)
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" % (str(e), s, o))
            o = str(e)
Example No. 5
    def updateEventRange(self, event_range_id, status='finished'):
        """ Update an event range on the Event Server """
        pUtil.tolog("Updating an event range..")

        message = ""
        # url = "https://aipanda007.cern.ch:25443/server/panda"
        url = "https://pandaserver.cern.ch:25443/server/panda"
        node = {}
        node['eventRangeID'] = event_range_id

        # node['cpu'] =  eventRangeList[1]
        # node['wall'] = eventRangeList[2]
        node['eventStatus'] = status
        # tolog("node = %s" % str(node))

        # open connection
        ret = pUtil.httpConnect(node, url, path=self.__updateEventRangesDir, mode="UPDATEEVENTRANGE")
        # response = ret[1]

        if ret[0]: # non-zero return code
            message = "Failed to update event range - error code = %d" % (ret[0])
        else:
            message = ""

        return ret[0], message
Example No. 6
def findVmPeaks(setup):
    """ Find the VmPeak values """

    vmPeakMax = 0
    vmPeakMean = 0
    RSSMean = 0

#    matched_lines = grep(["Py\:PerfMonSvc\s*INFO\s*VmPeak:\s*[0-9]"], stdout_filename)
#    pattern = "([0-9]+)"

#    # now extract the digits from the found lines
#    N = 0
#    vmPeaks = 0
#    for line in matched_lines:
#        _vmPeak = re.search(pattern, line)
#        if _vmPeak:
#            N += 1
#            vmPeak = _vmPeak.group(1)
#            if vmPeak > vmPeakMax:
#                vmPeakMax = vmPeak
#            vmPeaks += vmPeak

    # use the VmPeak script to get all values
    cmd = "%s python VmPeak.py >Pilot_VmPeak.txt" % (setup)
    try:
        ec, output = timedCommand(cmd, timeout=getProperTimeout(setup))
    except Exception, e:
        tolog("!!WARNING!!1111!! Failed to execute VmPeak script: %s" % (e))
Example No. 7
    def updateOutputFilesXMLWithSURLs4NG(self, experiment, siteWorkdir, jobId, outputFilesXML):
        """ Update the OutputFiles.xml file with SURLs """

        status = False

        # open and read back the OutputFiles.xml file
        _filename = os.path.join(siteWorkdir, outputFilesXML)
        if os.path.exists(_filename):
            try:
                f = open(_filename, "r")
            except Exception, e:
                tolog("!!WARNING!!1990!! Could not open file %s: %s" % (_filename, e))
            else:
                # get the metadata
                xmlIN = f.read()
                f.close()

                # update the XML
                xmlOUT = updateXMLWithSURLs(experiment, xmlIN, siteWorkdir, jobId, self.__jobrec, format='NG')

                # write the XML
                try:
                    f = open(_filename, "w")
                except Exception, e:
                    tolog("!!WARNING!!1990!! Could not open file %s: %s" % (_filename, e))
                else:
Example No. 8
def getProcessCommands(euid, pids):
    """ return a list of process commands corresponding to a pid list for user euid """

    _cmd = 'ps u -u %d' % (euid)
    processCommands = []
    ec, rs = commands.getstatusoutput(_cmd)
    if ec != 0:
        pUtil.tolog("Command failed: %s" % (rs))
    else:
        # extract the relevant processes
        pCommands = rs.split('\n')
        first = True
        for pCmd in pCommands:
            if first:
                # get the header info line
                processCommands.append(pCmd)
                first = False
            else:
                # remove extra spaces
                _pCmd = pCmd
                while "  " in _pCmd:
                    _pCmd = _pCmd.replace("  ", " ")
                items = _pCmd.split(" ")
                for pid in pids:
                    # items = username pid ...
                    if items[1] == str(pid):
                        processCommands.append(pCmd)
                        break

    return processCommands
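
A small standalone sketch of the whitespace normalization used above; the sample ps line is invented for illustration:

# made-up example of one "ps u" output line
sample = "user     12345  0.0  0.1 123456  7890 ?        S    10:00   0:01 python pilot.py"
_pCmd = sample
while "  " in _pCmd:
    _pCmd = _pCmd.replace("  ", " ")
items = _pCmd.split(" ")
print items[1]  # -> '12345', the pid column that getProcessCommands() matches against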
Example No. 9
    def purgeFiles(path, filename, limit=12*3600):
        """ locate and remove lingering directories/files """

        all_files = glob("%s/%s" % (path, filename))
        max_files = 50
        file_nr = 0

        for _file in all_files:
            if file_nr >= max_files:
                break

            # when was the dir last modified?
            current_time = int(time.time())
            try:
                file_modification_time = os.path.getmtime(_file)
            except:
                # skip this dir since it was not possible to read the modification time
                pass
            else:
                mod_time = current_time - file_modification_time
                if mod_time > limit:
                    tolog("Found file %s last modified %d s ago (will now try to purge it)" % (_file, mod_time))
                    ec, rs = commands.getstatusoutput("rm -f %s" % (_file))
                    if ec != 0:
                        tolog("Failed to remove dir: %s" % (rs))
            file_nr += 1
Example No. 10
    def remove(self, site, job):
        """ Remove the job state file. Should only be called for
        finished jobs after the last server update. """
        status = True

#        # get the file extension
#        extension = getExtension()
#
#        # do not use self.filename in this case since this function is only
#        # used in pilot.cleanup() where self.filename has not been set
#        fileName = "%s/jobState-%s.%s" % (site.workdir, job.jobId, extension)

        # get the appropriate filename
        fileName = self.getFilename(site.workdir, job.jobId)

        if os.path.isfile(fileName):
            # remove the job state file
            # os.system() does not raise OSError on failure, so check the return code instead
            rc = os.system("rm -f %s" % fileName)
            if rc != 0:
                tolog("JOBSTATE FAILURE: Failed to remove job state file: %s" % fileName)
                status = False
        else:
            tolog("JOBSTATE FAILURE: Job state file does not exist: %s" % fileName)
            status = False
            
        return status
Example No. 11
def addFullPathsAsInput(jobPars, full_paths_dictionary):
    """ Replace LFNs with full root paths """
    # jobPars = .. --inputEVNTFile=EVNT.01416937._000003.pool.root,EVNT.01416937._000004.pool.root ..
    # ->
    # jobPars = .. --inputEVNTFile=root://../EVNT.01416937._000003.pool.root,root://../EVNT.01416937._000004.pool.root
    # FORMAT: full_paths_dictionary = { 'LFN1':'protocol://fullpath/LFN1', .. }

    # Extract the inputEVNTFile from the jobPars
    if "--inputEVNTFile" in jobPars:
        found_items = re.findall(r'\S+', jobPars)

        pattern = r"\'?\-\-inputEVNTFile\=(.+)\'?"
        for item in found_items:
            found = re.findall(pattern, item)
            if len(found) > 0:
                input_files = found[0]
                if input_files.endswith("\'"):
                    input_files = input_files[:-1]
                if len(input_files) > 0:
                    for lfn in input_files.split(','):
                        if lfn in full_paths_dictionary.keys():
                            full_path = full_paths_dictionary[lfn]['pfn']
                            if full_path not in jobPars:
                                jobPars = jobPars.replace(lfn, full_path)
                        else:
                            tolog("!!WARNING!!3435!! Did not find LFN=%s" % lfn)
                else:
                    tolog(
                        "!!WARNING!!3434!! Zero length list, cannot update LFN:s with full paths (remote I/O will not work)")

    return jobPars
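
A hypothetical call to the function above, with an invented job parameter string and invented PFNs, showing the intended transformation:

jobPars = "Sim_tf.py --inputEVNTFile=EVNT.01416937._000003.pool.root,EVNT.01416937._000004.pool.root --maxEvents=10"
full_paths = {
    'EVNT.01416937._000003.pool.root': {'pfn': 'root://some.se.example//atlas/EVNT.01416937._000003.pool.root'},
    'EVNT.01416937._000004.pool.root': {'pfn': 'root://some.se.example//atlas/EVNT.01416937._000004.pool.root'},
}
jobPars = addFullPathsAsInput(jobPars, full_paths)
# each LFN listed after --inputEVNTFile is now replaced by its root:// PFN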
Example No. 12
    def __init__(self, setup_path, *args, **kwrds):
        self._setup = setup_path
        self.copyCommand = 'aria2c'
        self.commandInPATH()
        rucio_account = self.rucio_account
        tolog("Rucio account: %s" % (rucio_account))
        if rucio_account == "":
            tolog("!!FAILED!!2999!! Rucio account not set!")
            raise Exception("!!FAILED!!2999!! Rucio account not set!")
        cmd = "curl -1 -i -H \"X-Rucio-Account: $RUCIO_ACCOUNT\" --cacert %s --cert %s --key %s --capath %s -X GET https://rucio-auth-prod.cern.ch/auth/x509_proxy| grep 'X-Rucio-Auth-Token:'" % (self.sslKey, self.sslKey, self.sslKey, self.sslCertDir)
        tolog("Command to be launched: %s" % (cmd))
        token_rucio_cmd = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        token_rucio, stderr = token_rucio_cmd.communicate()
        if token_rucio:
            if '\r' in token_rucio:
                pos2print = token_rucio.find('\r')
                token_rucio = token_rucio[:pos2print]
            elif '\n' in token_rucio:
                pos2print = token_rucio.find('\n')
                token_rucio = token_rucio[:pos2print]
            pos2print = token_rucio.find("CN")
            token_rucio2print = token_rucio[:pos2print] + '(Hidden token)'
            tolog("Token on file: %s" % (token_rucio2print))

            if os.path.exists('token_file'):
                os.remove('token_file')
            try:
                token_file = open('token_file', 'w')
            except IOError, e:
                tolog("!!WARNING!! Failed to create file: %s" % (e))
                raise Exception("!!FAILED!!1099!! Cannot create file for registering token!")
            else:
                token_file.write(token_rucio)
Example No. 13
    def addMD5sum(self, lfn, md5sum):
        """ add md5sum to lfn """
        if os.environ.has_key('LD_LIBRARY_PATH'):
            tolog("LD_LIBRARY_PATH prior to lfc import: %s" % os.environ['LD_LIBRARY_PATH'])
        else:
            tolog("!!WARNING!!2999!! LD_LIBRARY_PATH not set prior to lfc import")
        import lfc
        os.environ['LFC_HOST'] = readpar('lfchost')
        #    b="."
        #    buffer = b.zfill(200)
        #    ret = lfc.lfc_seterrbuf(buffer, len(buffer))
        stat = lfc.lfc_filestatg()
        exitcode = lfc.lfc_statg(lfn, "", stat)
        if exitcode != 0:
            #    print "error:",buffer
            err_num = lfc.cvar.serrno
            tolog("!!WARNING!!2999!! lfc.lfc_statg: %d %s" % (err_num, lfn))
            return exitcode
        exitcode = lfc.lfc_setfsizeg(stat.guid, stat.filesize, 'MD', md5sum)
        if exitcode != 0:
            #    print "error:",buffer
            err_num = lfc.cvar.serrno
            tolog("[Non-fatal] ERROR: lfc.lfc_setfsizeg: %d %s %s" % (err_num, lfn, md5sum))
            return exitcode
        tolog("Successfully set md5sum for %s" % (lfn))
        return exitcode
Example No. 14
    def downloadAllQueuenames(self):
        """ Download the entire schedconfig from AGIS """

        ec = 0

        # Do not even bother to download anything if JSON is not supported
        try:
            from json import load
        except:
            tolog("!!WARNING!!1231!! JSON is not available, cannot download schedconfig dump")
            ec = -1
        else:
            # url = "http://atlas-agis-api-dev.cern.ch/request/pandaqueue/query/list/?json"
            url = "http://atlas-agis-api.cern.ch/request/pandaqueue/query/list/?json&preset=schedconf.all&tier_level=1&type=production"
            schedconfig_dump = self.getAllQueuedataFilename()
            cmd = "curl \'%s\' >%s" % (url, schedconfig_dump)

            if os.path.exists(schedconfig_dump):
                tolog("File %s already downloaded" % (schedconfig_dump))
            else:
                tolog("Executing command: %s" % (cmd))
                ec, out = commands.getstatusoutput(cmd)
                if ec != 0:
                    tolog("!!WARNING!!1234!! Failed to download %s: %d, %s" % (schedconfig_dump, ec, out))
                else:
                    tolog("Downloaded schedconfig dump")

        return ec
Example No. 15
    def getSpecialAppdir(self, value):
        """ Get a special appdir depending on whether env variable 'value' exists """

        ec = 0
        _appdir = ""

        # is the environment variable 'value' set?
        if os.environ.has_key(value):
            # expand the value in case it contains further environmental variables
            _appdir = os.path.expandvars(os.environ[value])
            tolog("Environment has variable $%s = %s" % (value, _appdir))
            if _appdir == "":
                tolog("!!WARNING!!2999!! Environmental variable not set: %s" % (value))
                ec = self.__error.ERR_SETUPFAILURE
            else:
                # store the evaluated symbol in appdir
                if self.replaceQueuedataField('appdir', _appdir, verbose=False):
                    tolog("Updated field %s in queuedata: %s" % ('appdir', _appdir))
                else:
                    tolog("!!WARNING!!2222!! Queuedata field could not be updated, cannot continue")
                    ec = self.__error.ERR_SETUPFAILURE
        else:
            tolog("!!WARNING!!2220!! Environmental variable %s is not defined" % (value))

        return ec, _appdir
Example No. 16
    def stageInFile(self, source, destination, sourceSize, sourceChecksum, guid=None):
        """StageIn the file. should be implementated by different site mover."""
        statusRet = 0
        outputRet = {}
        outputRet["errorLog"] = None
        outputRet["report"] = {}
        outputRet["report"]["clientState"] = None

        # build the parameters
        _params = ""
        if sourceSize != 0 and sourceSize != "0":
            _params += self.__par_filesize % (sourceSize)
        if sourceChecksum and sourceChecksum != 'None' and sourceChecksum != 0 and sourceChecksum != "0" and not self.isDummyChecksum(sourceChecksum):
            csumtype = self.getChecksumType(sourceChecksum)
            # special case for md5sum (command only understands 'md5' and 'adler32', and not 'ad' and 'md5sum')
            if csumtype == 'md5sum':
                csumtype = 'md5'
            _params += self.__par_checksum % ("%s:%s" % (csumtype, sourceChecksum),)
        # add the guid option (only if a guid was actually provided)
        if guid:
            _params += " --guid %s" % (guid)

        self.log("StageIn files started.")
        _cmd_str = self.__localget % (self._setup, _params, source, destination)
        self.log('Executing command: %s' % (_cmd_str))
        s = -1
        o = '(not defined)'
        t0 = os.times()
        outputRet["report"]['relativeStart'] = time()
        outputRet["report"]['transferStart'] = time()
        try:
            timerCommand = TimerCommand(_cmd_str)
            s, o = timerCommand.run(timeout=self.timeout)
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught by stageInFile(): %s" % (str(e)))
            o = str(e)
Example No. 17
    def getTier1Queue2(self, cloud):
        """ Download the queuedata for the Tier-1 in the corresponding cloud and get the queue name """

        queuename = ""

        path = self.getTier1InfoFilename()
        ec = self.downloadTier1Info()
        if ec == 0:
            # Process the downloaded T-1 info
            f = open(path, 'r')
            if getExtension() == "json":
                from json import loads
                data = loads(f.read())
            else:
                from pickle import load
                data = load(f)
            f.close()

            # Extract the relevant queue info for the given cloud
            T1_info = [x for x in data if x['cloud']==cloud]

            # finally get the queue name
            if T1_info != []:
                info = T1_info[0]
                if info.has_key('PanDAQueue'):
                    queuename = info['PanDAQueue']
                else:
                    tolog("!!WARNING!!1222!! Returned Tier-1 info object does not have key PanDAQueue: %s" % str(info))
            else:
                tolog("!!WARNING!!1223!! Found no Tier-1 info for cloud %s" % (cloud))

        return queuename
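
A standalone sketch of the cloud filtering performed above, using hand-built data in place of the downloaded Tier-1 info (clouds and queue names are invented):

data = [{'cloud': 'US', 'PanDAQueue': 'SOME_US_QUEUE'}, {'cloud': 'DE', 'PanDAQueue': 'SOME_DE_QUEUE'}]
cloud = 'DE'
T1_info = [x for x in data if x['cloud'] == cloud]
queuename = T1_info[0]['PanDAQueue'] if T1_info else ""
print queuename  # -> SOME_DE_QUEUE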
Example No. 18
        def sig2exc(sig, frm):
            """ signal handler """

            error = PilotErrors()
            global failureCode, globalPilotErrorDiag, globalErrorCode
            globalPilotErrorDiag = "!!FAILED!!3000!! Signal %s is caught in child pid=%d!\n" % (sig, os.getpid())
            tolog(globalPilotErrorDiag)
            if sig == signal.SIGTERM:
                globalErrorCode = error.ERR_SIGTERM
            elif sig == signal.SIGQUIT:
                globalErrorCode = error.ERR_SIGQUIT
            elif sig == signal.SIGSEGV:
                globalErrorCode = error.ERR_SIGSEGV
            elif sig == signal.SIGXCPU:
                globalErrorCode = error.ERR_SIGXCPU
            elif sig == signal.SIGBUS:
                globalErrorCode = error.ERR_SIGBUS
            elif sig == signal.SIGUSR1:
                globalErrorCode = error.ERR_SIGUSR1
            else:
                globalErrorCode = error.ERR_KILLSIGNAL
            failureCode = globalErrorCode
            # print to stderr
            print >> sys.stderr, globalPilotErrorDiag
            raise SystemError(sig)
Example No. 19
    def fixStageInPath(self, path):
        """Fix the path"""

        if path[:3] == "srm" and '?SFN=' in path:
            self.log("Found SFN part in file path: %s" % (path))
        elif path[:3] == "srm":
            try:
                hostname = path.split('/',3)[2]
            except Exception as e:
                self.log("'!!WARNING!!2999!! Could not extract srm protocol for replacement, keeping path variable as it is: %s (%s)' %\
                      (path, str(e))")
            else:
                # srm = 'srm://head01.aglt2.org'
                srm = 'srm://' + hostname

                # does seopt contain any matching srm's?
                sematch = self.getSEMatchFromSEOpt(srm)
                if sematch != "":
                    path = path.replace(srm, sematch)
                    self.log("Replaced %s with %s (from seopt) in path: %s" % (srm, sematch, path))
                else:
                    se = readpar('se').split(",")[0]
                    _dummytoken, se = self.extractSE(se)
                    tolog("Using SE: %s" % (se))

                    path = path.replace(srm, se)
                    self.log("Replaced %s with %s (from se) in path: %s" % (srm, se, path))

                    # add port number from se to getfile if necessary (se is only defined in this branch)
                    path = self.addPortToPath(se, path)
        return path
Example No. 20
def getOutFilesGuids(outFiles, workdir):
    """ get the outFilesGuids from the PFC """

    ec = 0
    pilotErrorDiag = ""
    outFilesGuids = []
    pfcFile = "%s/PoolFileCatalog.xml" % (workdir)

    # initialization: make sure the guid list has the same length as the file list
    for i in range (0, len(outFiles)):
        outFilesGuids.append(None)

    # make sure the PFC exists
    if os.path.isfile(pfcFile):
        from xml.dom import minidom
        xmldoc = minidom.parse(pfcFile)
        fileList = xmldoc.getElementsByTagName("File")
        for thisfile in fileList:
            gpfn = str(thisfile.getElementsByTagName("pfn")[0].getAttribute("name"))
            guid = str(thisfile.getAttribute("ID"))
            for i in range(0,len(outFiles)):
                if outFiles[i] == gpfn:
                    outFilesGuids[i] = guid
    else:
        pilotErrorDiag = "PFC file does not exist: %s" % (pfcFile)
        tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
        error = PilotErrors()
        ec = error.ERR_MISSINGPFC

    return ec, pilotErrorDiag, outFilesGuids
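
A standalone sketch of the minidom lookup used above, run on an inline PoolFileCatalog fragment (the file name and GUID are invented):

from xml.dom import minidom

xml_text = """<?xml version="1.0" encoding="UTF-8"?>
<POOLFILECATALOG>
  <File ID="11111111-2222-3333-4444-555555555555">
    <physical><pfn filetype="ROOT_All" name="AOD.pool.root"/></physical>
  </File>
</POOLFILECATALOG>"""

xmldoc = minidom.parseString(xml_text)
for thisfile in xmldoc.getElementsByTagName("File"):
    gpfn = str(thisfile.getElementsByTagName("pfn")[0].getAttribute("name"))
    guid = str(thisfile.getAttribute("ID"))
    print gpfn, guid  # -> AOD.pool.root 11111111-2222-3333-4444-555555555555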
Example No. 21
def stageIn(job, jobSite, analJob, pilot_initdir, pworkdir):
    """ perform the stage-in """

    ec = 0
    statusPFCTurl = None
    usedFAXandDirectIO = False

    # prepare the input files (remove non-valid names) if there are any
    ins, job.filesizeIn, job.checksumIn = RunJobUtilities.prepareInFiles(job.inFiles, job.filesizeIn, job.checksumIn)
    if ins:
        tolog("Preparing for get command")

        # get the file access info (only useCT is needed here)
        useCT, oldPrefix, newPrefix, useFileStager, directIn = getFileAccessInfo()

        # transfer input files
        tin_0 = os.times()
        ec, job.pilotErrorDiag, statusPFCTurl, job.filesWithoutFAX, job.filesWithFAX, usedFAXandDirectIO = \
            mover.get_data(job, jobSite, ins, stageinretry, analysisJob=analJob, usect=useCT,\
                           pinitdir=pilot_initdir, proxycheck=False, inputDir=inputDir, workDir=pworkdir)
        if ec != 0:
            job.result[2] = ec
        tin_1 = os.times()
        job.timeStageIn = int(round(tin_1[4] - tin_0[4]))

    return job, ins, statusPFCTurl, usedFAXandDirectIO
Example No. 22
    def getValidBaseURLs(self, order=None):
        """ Return list of valid base URLs """
        # if order is defined, return given item first
        # e.g. order=http://atlpan.web.cern.ch/atlpan -> ['http://atlpan.web.cern.ch/atlpan', ...]

        validBaseURLs = []
        _validBaseURLs = ["http://www.usatlas.bnl.gov",\
                          "https://www.usatlas.bnl.gov",\
                          "http://pandaserver.cern.ch",\
                          "http://atlpan.web.cern.ch/atlpan",\
                          "https://atlpan.web.cern.ch/atlpan",\
                          "http://common-analysis-framework.cern.ch",\
                          "http://classis01.roma1.infn.it",\
                          "http://atlas-install.roma1.infn.it",\
                          "http://homepages.physik.uni-muenchen.de/~Johannes.Ebke"]

        if order:
            validBaseURLs.append(order)
            for url in _validBaseURLs:
                if url != order:
                    validBaseURLs.append(url)
        else:
            validBaseURLs = _validBaseURLs

        tolog("getValidBaseURLs will return: %s" % str(validBaseURLs))
        return validBaseURLs
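
The reordering behaviour can be illustrated with a short standalone sketch (a trimmed URL list is used for brevity):

_validBaseURLs = ["http://www.usatlas.bnl.gov", "http://pandaserver.cern.ch", "http://atlpan.web.cern.ch/atlpan"]
order = "http://atlpan.web.cern.ch/atlpan"
validBaseURLs = [order] + [url for url in _validBaseURLs if url != order]
print validBaseURLs  # the preferred base URL comes first, the rest keep their original order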
Example No. 23
    def removeRedundantFiles(self, workdir):
        """ Remove redundant files and directories """

        # List of files and directories to be removed from work directory prior to log file creation
        # Make sure that any large files or directories that are not wanted in the log file are included in this list
        dir_list = [
                    "buildJob*",
                    "external",
                    "fort.*",
                    "home",
                    "python",
                    "share",
                    "workdir",
                    "*.py",
                    "*.pyc",
                    "*.root*",
                    "JEM",
                    "tmp*",
                    "*.tmp",
                    "*.TMP",
                    "scratch",
                    ]
    
        for _dir in dir_list: 
            files = glob(os.path.join(workdir, _dir))
            rc = remove(files)
            if not rc:
                tolog("IGNORE: Failed to remove redundant file(s): %s" % (files))
Example No. 24
    def getRemoteFileInfo(self, file):
        try:
            size, md5 = self.s3Objectstore.getRemoteFileInfo(file)
        except:
            tolog("Failed to get remote file information: %s" % (sys.exc_info()[1]))
            return None, None
        return size, md5
Example No. 25
    def getSecurityKey(self, privateKeyName, publicKeyName):
        """ Return the key pair """
        keyName=privateKeyName + "_" + publicKeyName
        if keyName in self.__securityKeys.keys():
            return self.__securityKeys[keyName]
        else:
            try:
                #import environment
                #env = environment.set_environment()

                sslCert = self.getSSLCertificate()
                sslKey = sslCert

                node={}
                node['privateKeyName'] = privateKeyName
                node['publicKeyName'] = publicKeyName
                #host = '%s:%s' % (env['pshttpurl'], str(env['psport'])) # The key pair is not set on other panda server
                host = 'aipanda007.cern.ch:25443'
                path = '/server/panda/getKeyPair'
                conn = httplib.HTTPSConnection(host,key_file=sslKey, cert_file=sslCert)
                conn.request('POST',path,urllib.urlencode(node))
                resp = conn.getresponse()
                data = resp.read()
                conn.close()
                dic = cgi.parse_qs(data)
                if dic["StatusCode"][0] == "0":
                    self.__securityKeys[keyName] = {"publicKey": dic["publicKey"][0], "privateKey": dic["privateKey"][0]}
                    return self.__securityKeys[keyName]
            except:
                _type, value, traceBack = sys.exc_info()
                tolog("Failed to getKeyPair for (%s, %s)" % (privateKeyName, publicKeyName))
                tolog("ERROR: %s %s" % (_type, value))
                
        return {"publicKey": None, "privateKey": None}
Example No. 26
    def dumpValue(self, name, value):
        """ Print a value if not empty """

        if value != "":
            tolog("%s = %s" % (name, value))
        else:
            tolog("%s was not set by the batch system" % (name))
Example No. 27
    def addToJobMetrics(self, jobResult, path, jobId):
        """ Add the batch job and machine features to the job metrics """

        jobMetrics = ""

#        jobMetrics += self.addFieldToJobMetrics("hs06", self.hs06)
#        jobMetrics += self.addFieldToJobMetrics("shutdowntime", self.shutdownTime)
#        jobMetrics += self.addFieldToJobMetrics("jobslots", self.jobSlots)
#        jobMetrics += self.addFieldToJobMetrics("phys_cores", self.physCores)
#        jobMetrics += self.addFieldToJobMetrics("log_cores", self.logCores)
#        jobMetrics += self.addFieldToJobMetrics("cpufactor_lrms", self.cpuFactorLrms)
#        jobMetrics += self.addFieldToJobMetrics("cpu_limit_secs_lrms", self.cpuLimitSecsLrms)
#        jobMetrics += self.addFieldToJobMetrics("cpu_limit_secs", self.cpuLimitSecs)
#        jobMetrics += self.addFieldToJobMetrics("wall_limit_secs_lrms", self.wallLimitSecsLrms)
#        jobMetrics += self.addFieldToJobMetrics("wall_limit_secs", self.wallLimitSecs)
#        jobMetrics += self.addFieldToJobMetrics("disk_limit_GB", self.diskLimitGB)
#        jobMetrics += self.addFieldToJobMetrics("jobstart_secs", self.jobStartSecs)
#        jobMetrics += self.addFieldToJobMetrics("mem_limit_MB", self.memLimitMB)
#        jobMetrics += self.addFieldToJobMetrics("allocated_CPU", self.allocatedCPU)

        # Get the max disk space used by the payload (at the end of a job)
        if jobResult == "finished" or jobResult == "failed" or jobResult == "holding":
            max_space = getMaxWorkDirSize(path, jobId)
            if max_space > 0L:
                jobMetrics += self.addFieldToJobMetrics("workDirSize", max_space)
            else:
                tolog("Will not add max space = %d to job metrics" % (max_space))

        return jobMetrics
Example No. 28
    def checkSpecialEnvVars(self, sitename):
        """ Check special environment variables """

        ec = 0
        tolog("No special env var checks for site %s" % (sitename))

        return ec
Example No. 29
    def deserialize(self, text_string):
        tolog("Job received: %s", text_string)
        try:
            self.__dict__ = deserialize(text_string)
        except:
            tolog(" received exception while converting json string to BalsamJob: " + str(sys.exc_info()[1]))
            raise

        self.executable = convert_unicode_string(self.executable)
        self.executable_args = convert_unicode_string(self.executable_args)
        self.input_url = convert_unicode_string(self.input_url)
        tmp_input_files = []
        for i_file in self.input_files:
            tmp_input_files.append(convert_unicode_string(i_file))
        self.input_files = tmp_input_files
        self.output_url = convert_unicode_string(self.output_url)
        tmp_output_files = []
        for o_file in self.output_files:
            tmp_output_files.append(convert_unicode_string(o_file))
        self.output_files = tmp_output_files
        self.preprocess = convert_unicode_string(self.preprocess)
        self.preprocess_args = convert_unicode_string(self.preprocess_args)
        self.postprocess = convert_unicode_string(self.postprocess)
        self.postprocess_args = convert_unicode_string(self.postprocess_args)
        self.scheduler_args = convert_unicode_string(self.scheduler_args)
        self.condor_job_file = convert_unicode_string(self.condor_job_file)
        self.condor_dagman_file = convert_unicode_string(self.condor_dagman_file)
        self.target_site = convert_unicode_string(self.target_site)
Example No. 30
    def getSubprocessName(self, eventService):
        """ Select which subprocess is to be run by the Monitor """

        # The default subprocess is RunJob (name='Normal', which performs payload setup, stage-in, payload execution and stage-out).
        # An alternative subprocess is the runEvent module which downloads events from an Event Server, executes a payload
        # and stages out output files asynchronously as they are ready.
        # Note: send the entire job object to this method since there might be other subprocesses created at a later time which
        # will be identified by this method using some other job data member

        # Default subprocess name
        name = "RunJob"

        # Select alternative subprocess names for HPCs
        isHPC, _name = extractHPCInfo(readpar('catchall'))
        if isHPC:
            name = "RunJob" + _name # e.g. "RunJobTitan" is the proper subprocess name for the Titan plug-in

        # for es merge jobs
        if _name == "Hpc":
            name = "RunJob"

        # Are we going to run an event service job?
        if eventService:
            tolog("Encountered an event service job")
            if isHPC:
                name = "RunJob%sEvent" % (_name)
            else:
                name = "RunJobEvent"

        tolog("Selected subprocess: %s" % (name))

        return name
Example No. 31
                        if len(ls) == 1:
                            if "workDir" in ls:
                                ec, rs = commands.getstatusoutput("ls -lF %s" %
                                                                  (_dir))
                                tolog("ls: %s" % str(rs))
                                tolog(
                                    "Found single workDir: %s (will now purge it)"
                                    % (_dir))
                                ec, rs = commands.getstatusoutput("rm -rf %s" %
                                                                  (_dir))
                                if ec != 0:
                                    tolog("Failed to remove dir: %s" % (rs))
                                else:
                                    purged_nr += 1
            dir_nr += 1
        tolog("Purged %d single workDirs directories" % (purged_nr))

    purgeWorkDirs = staticmethod(purgeWorkDirs)

    def purgeFiles(path, filename, limit=12 * 3600):
        """ locate and remove lingering directories/files """

        all_files = glob("%s/%s" % (path, filename))
        max_files = 50
        file_nr = 0

        for _file in all_files:
            if file_nr >= max_files:
                break

            # when was the dir last modified?
Example No. 32
    def getFileTransferInfo(self, transferType, buildJob):
        """ Get all relevant fields related to file transfer """

        copysetup = readpar('copysetupin')

        # create the direct access dictionary
        fileTransferInfo = getDirectAccessDic(copysetup)

        # if copysetupin did not contain direct access info, try the copysetup instead
        if not fileTransferInfo:
            copysetup = readpar('copysetup')
            fileTransferInfo = getDirectAccessDic(copysetup)

        # should the copytool be used?
        useCopyTool = False
        useFileStager = False
        useDirectAccess = False
        lfcHost = readpar('lfchost')
        oldPrefix = ""
        newPrefix = ""
        dInfo = None
        if fileTransferInfo:
            dInfo = True
            # no direct access / remote I/O, use standard copytool (copy-to-scratch)
            if fileTransferInfo['useCopyTool']:
                useCopyTool = True
            # do not set the LFC host for file stager
            if fileTransferInfo['useFileStager']:
                useFileStager = True
            if fileTransferInfo['directIn']:
                useDirectAccess = True

            oldPrefix = fileTransferInfo['oldPrefix']
            newPrefix = fileTransferInfo['newPrefix']

        # override settings for transferType direct
        if transferType == 'direct':
            useCopyTool = False
            useFileStager = False
            useDirectAccess = True
            if oldPrefix == "" and newPrefix == "":
                lfcHost = ""

        # should pilot create TURL based PFC? (not done here, but setup needs to be aware of it)
        # if dInfo and useDirectAccess and oldPrefix == "" and newPrefix == "":
        if (transferType == 'direct' or
            (useFileStager and useDirectAccess)) and (
                oldPrefix == "" and newPrefix == "") and not buildJob:
            #        if (transferType == 'direct' or (not useFileStager and useDirectAccess)) and (oldPrefix == "" and newPrefix == ""):
            usePFCTurl = True
        else:
            usePFCTurl = False

        # force usePFCTurl for all jobs
        if not buildJob and useDirectAccess:
            tolog("Forced usePFCTurl (reset old/newPrefix)")
            usePFCTurl = True
            oldPrefix = ""
            newPrefix = ""

        return dInfo, useCopyTool, useDirectAccess, useFileStager, oldPrefix, newPrefix, copysetup, usePFCTurl, lfcHost
Example No. 33
class Experiment(object):

    #    experiment = "generic"               # String defining the experiment

    # private data members
    __experiment = "generic"  # String defining the experiment
    __instance = None  # Boolean used by subclasses to become a Singleton
    __error = PilotErrors()  # PilotErrors object
    __doFileLookups = False  # True for LFC based file lookups (basically a dummy data member here since singleton object is static)
    __cache = ""  # Cache URL used e.g. by LSST

    # Required methods

    def __init__(self, *args, **kwargs):
        """ Default initialization """

        # e.g. self.__errorLabel = errorLabel
        #        self.experiment = kwargs.get('experiment')
        pass

    def getExperiment(self):
        """ Return a string with the experiment name """

        #        return self.experiment
        return self.__experiment

    def getJobExecutionCommand(self):
        """ Define and test the command(s) that will be used to execute the payload """
        # E.g. cmd = "source <path>/setup.sh; <path>/python <script>"

        cmd = ""

        return cmd

    def getFileLookups(self):
        """ Return the file lookup boolean """

        return self.__doFileLookups

    def doFileLookups(self, doFileLookups):
        """ Update the file lookups boolean """

        # Only implement this method if class really wants to update the __doFileLookups boolean
        # ATLAS wants to implement this, but not CMS
        # Method is used by Mover
        # self.__doFileLookups = doFileLookups
        pass

    def willDoAlternativeFileLookups(self):
        """ Should file lookups be done using alternative methods? """

        # E.g. in the migration period where LFC lookups are halted in favour of other methods in the Rucio API
        # (for ATLAS), this method could be useful. See the usage in Mover::getReplicaDictionary() which is called
        # after Experiment::willDoFileLookups() defined above. The motivation is that direct LFC calls are not to be
        # used any longer by the pilot, and in the migration period the actual LFC calls will be done in the Rucio
        # API. Eventually this API will switch to alternative file lookups.

        return False

    def willDoFileLookups(self):
        """ Should (LFC) file lookups be done by the pilot or not? """

        return self.__doFileLookups

    def willDoFileRegistration(self):
        """ Should (LFC) file registration be done by the pilot or not? """

        return False

    def getFileCatalog(self):
        """ Return the default file catalog to use (e.g. for replica lookups) """
        # See usage in Mover.py

        # e.g. 'lfc://prod-lfc-atlas.cern.ch:/grid/atlas'
        return ""

    # Additional optional methods
    # These methods are optional and can be left as they are here, or modified according to special needs

    def verifyProxy(self, envsetup="", limit=None):
        """ Check for a valid voms/grid proxy longer than N hours """
        # Use 'limit' to set required length

        tolog("(verifyProxy() is not implemented)")

        exitcode = 0
        pilotErrorDiag = ""

        return exitcode, pilotErrorDiag

    def removeRedundantFiles(self, workdir):
        """ Remove redundant files and directories """

        # List of files and directories to be removed from work directory prior to log file creation
        # Make sure that any large files or directories that are not wanted in the log file are included in this list
        dir_list = [
            "buildJob*",
            "external",
            "fort.*",
            "home",
            "python",
            "share",
            "workdir",
            "*.py",
            "*.pyc",
            "*.root*",
            "JEM",
            "tmp*",
            "*.tmp",
            "*.TMP",
            "scratch",
        ]

        for _dir in dir_list:
            files = glob(os.path.join(workdir, _dir))
            rc = remove(files)
            if not rc:
                tolog("IGNORE: Failed to remove redundant file(s): %s" %
                      (files))

    def getPayloadName(self, job):
        """ Set a suitable name for the payload stdout """

        # The payload <name> gets translated into <name>_stdout.txt
        # which is the name of the stdout file produced by the payload execution
        # (essentially commands.getoutput("<setup>; <payload executable> [options] > <name>_stdout.txt"))

        # The job object can be used to create more precise stdout names (see e.g. the ATLASExperiment implementation)

        return "payload"

    def isOutOfMemory(self, **kwargs):
        """ Try to identify out of memory errors in the stderr/out """

        return False

    def getNumberOfEvents(self, **kwargs):
        """ Return the number of events """

        return 0

    def specialChecks(self, **kwargs):
        """ Implement special checks here """

        # Return False if fatal failure, otherwise return True
        # The pilot will abort if this method returns a False

        # On an HPC system, it might be good to skip certain checks (e.g. CVMFS, LFC, etc). Refer to schedconfig.resourcetype, set to 'hpc' on an HPC queue

        status = False

        tolog("No special checks for \'%s\'" % (self.experiment))

        return True  # obviously change this to 'status' once implemented

    def checkSpecialEnvVars(self, sitename):
        """ Check special environment variables """

        ec = 0
        tolog("No special env var checks for site %s" % (sitename))

        return ec

    def setINDS(self, realDatasetsIn):
        """ Extract the dataset as set by pathena option --inDS and set the INDS environmental variable """
        # Needed by pathena (move to ATLASExperiment later)

        inDS = ""
        for ds in realDatasetsIn:
            if "DBRelease" not in ds and ".lib." not in ds:
                inDS = ds
                break
        if inDS != "":
            tolog("Setting INDS env variable to: %s" % (inDS))
            os.environ['INDS'] = inDS
        else:
            tolog("INDS unknown")

    def getValidBaseURLs(self, order=None):
        """ Return list of valid base URLs """
        # if order is defined, return given item first
        # e.g. order=http://atlpan.web.cern.ch/atlpan -> ['http://atlpan.web.cern.ch/atlpan', ...]

        validBaseURLs = []
        _validBaseURLs = ["http://www.usatlas.bnl.gov",\
                          "https://www.usatlas.bnl.gov",\
                          "http://pandaserver.cern.ch",\
                          "http://atlpan.web.cern.ch/atlpan",\
                          "https://atlpan.web.cern.ch/atlpan",\
                          "http://common-analysis-framework.cern.ch",\
                          "http://classis01.roma1.infn.it",\
                          "http://atlas-install.roma1.infn.it",\
                          "http://homepages.physik.uni-muenchen.de/~Johannes.Ebke"]

        if order:
            validBaseURLs.append(order)
            for url in _validBaseURLs:
                if url != order:
                    validBaseURLs.append(url)
        else:
            validBaseURLs = _validBaseURLs

        tolog("getValidBaseURLs will return: %s" % str(validBaseURLs))
        return validBaseURLs

    def downloadTrf(self, wgetCommand, jobTrf):
        """ Download the trf """

        status = False
        pilotErrorDiag = ""
        cmd = "%s %s" % (wgetCommand, jobTrf)
        trial = 1
        max_trials = 3

        # try to download the trf a maximum of 3 times
        while trial <= max_trials:
            tolog("Executing command [Trial %d/%d]: %s" %
                  (trial, max_trials, cmd))
            ec, rets = commands.getstatusoutput(cmd)
            if not rets:
                rets = "(None)"
            if ec != 0:
                # Analyze exit code / output
                from futil import check_syserr
                check_syserr(ec, rets)
                pilotErrorDiag = "wget command failed: %d, %s" % (ec, rets)
                tolog("!!WARNING!!3000!! %s" % (pilotErrorDiag))
                if trial == max_trials:
                    tolog("!!FAILED!!3000!! Could not download trf: %s" %
                          (rets))
                    status = False
                    break
                else:
                    tolog("Will try again after 60s..")
                    from time import sleep
                    sleep(60)
            else:
                pilotErrorDiag = ""
                tolog("wget command returned: %s" % (rets))
                status = True
                break
            trial += 1

        return status, pilotErrorDiag

    def getAnalysisTrf(self, wgetCommand, origTRF, pilot_initdir):
        """ Get the trf to be used for analysis jobs """

        pilotErrorDiag = ""
        trfName = origTRF.split('/')[-1]
        tolog("trfName = %s" % (trfName))
        origBaseURL = ""

        # Copy trf from pilot init dir if distributed with pilot code
        fname = os.path.join(pilot_initdir, trfName)
        status = False
        if os.path.exists(fname):
            from shutil import copy2
            try:
                copy2(fname, os.getcwd())
            except Exception, e:
                tolog(
                    "!!WARNING!!2999!! Could not copy trf from pilot init dir: %s"
                    % str(e))
            else:
                tolog("Copied trf (%s) from pilot init dir" % (fname))
                status = True

        # Download trf
        if not status:
            # verify the base URL
            for baseURL in self.getValidBaseURLs():
                if origTRF.startswith(baseURL):
                    origBaseURL = baseURL
                    break

            if origBaseURL == "":
                pilotErrorDiag = "Invalid base URL: %s" % (origTRF)
                return self.__error.ERR_TRFDOWNLOAD, pilotErrorDiag, ""
            else:
                tolog("Verified the trf base URL: %s" % (origBaseURL))

            # try to download from the required location, if not - switch to backup
            for baseURL in self.getValidBaseURLs(order=origBaseURL):
                trf = re.sub(origBaseURL, baseURL, origTRF)
                tolog("Attempting to download trf: %s" % (trf))
                status, pilotErrorDiag = self.downloadTrf(wgetCommand, trf)
                if status:
                    break

            if not status:
                return self.__error.ERR_TRFDOWNLOAD, pilotErrorDiag, ""

            tolog("Successfully downloaded trf")

        tolog("Changing permission of %s to 0755" % (trfName))
        try:
            os.chmod(trfName, 0755)
        except Exception, e:
            pilotErrorDiag = "Failed to chmod %s: %s" % (trfName, str(e))
            return self.__error.ERR_CHMODTRF, pilotErrorDiag, ""
Example No. 34
    # Optional
    def getSubprocess(self, cmd, stdout=None, stderr=None):
        """ Execute and return a subprocess """

        process = None
        try:
            tolog("Executing command: %s" % (cmd))
            if stdout and stderr:
                # use the stdout/stderr file objects to redirect the stdout/stderr streams
                process = Popen(cmd, shell=True, stdout=stdout, stderr=stderr)
            else:
                process = Popen(cmd, shell=True)
        except Exception, e:
            tolog("!!WARNING!!2344!! Caught exception: %s" % (e))
        else:
            tolog("Subprocess is running")

        return process

    # Optional
    def getJobExecutionCommand4EventService(self):
        """ Define and test the command(s) that will be used to execute the payload for the event service """
        # E.g. cmd = ["source <path>/setup.sh; <path>/python <script>"]
        # The command returned from this method is executed using subprocess.Popen() from the runEvent module

        # Note: this optional method only need to be defined in case the event service is to be used
        # As of March 2014, this is not yet functional or documented.

        # The actual command must be declared as a list since that is expected by Popen()
        cmd = [""]
Example No. 35
class Cleaner:
    """
    This class is used to clean up lingering old/lost jobs.
    The clean-up criterion is that, for a found PanDA job directory,
    if pilotlog.txt has not been updated for at least <limit> hours
    and the job state is 'running', the job is assumed to have been
    unexpectedly terminated and is erased from disk.
    The class defines the clean-up limit, but overrides this value if set
    in schedconfig.
    The cleanup() method should be executed after queuedata has been downloaded
    and after job recovery (which might or might not be turned on).

    Usage:
           from Cleaner import Cleaner
           cleaner = Cleaner(limit=<limit>, path=<path>, uflag=<uflag>)
           ec = cleaner.cleanup()

    cleanup() will return True for a successful/performed cleanup, False otherwise.

    <path> should normally be thisSite.wntmpdir
    <limit> should be an integer > 0 [hours]
    <uflag> user flag needed to distinguish job type (an analysis pilot is not allowed
            to touch production job directories on some sites)
    """
    def __init__(self, limit=12, path="/tmp", uflag=None):
        """ Default init with verification """

        self.clean = True
        self.uflag = None
        # verify the clean-up limit
        _type = str(limit.__class__)
        if limit and _type.find('int') == -1:
            tolog("Trying to convert limit from type %s to int" % (_type))
            try:
                limit = int(limit)
            except:
                tolog("Failed to convert, reset to default")
                limit = 12
        if limit == 0:
            tolog("Clean-up limit set to zero (no clean-up will be done)")
            self.clean = False
        elif limit < 0 or not limit:
            limit = 12
            tolog(
                "!!WARNING!!5500!! Clean-up limit out of bounds, reset to default: %d"
                % (limit))

        self.limit = limit
        tolog("Cleaner initialized with clean-up limit: %d hours" %
              (self.limit))

        # verify the clean-up path and set the uflag if necessary
        if self.clean:
            if not path:
                path = "/tmp"
                tolog("Requested path reset to default: %s" % (path))
            if os.path.exists(path):
                self.path = path
                tolog(
                    "Cleaner will scan for lost directories in verified path: %s"
                    % (self.path))

                if uflag:
                    self.uflag = uflag
            else:
                tolog(
                    "!!WARNING!!5500!! No such directory: %s (clean-up not possible)"
                    % (path))
                self.path = None
                self.clean = False

    def cleanup(self):
        """ execute the clean-up """

        status = True
        number_of_cleanups = 0

        if self.clean:
            tolog("Executing empty dirs clean-up, stage 1/5")
            Cleaner.purgeEmptyDirs(self.path)

            tolog("Executing work dir clean-up, stage 2/5")
            Cleaner.purgeWorkDirs(self.path)

            tolog("Executing maxed-out dirs clean-up, stage 3/5")
            Cleaner.purgeMaxedoutDirs(self.path)

            tolog("Executing AthenaMP clean-up, stage 4/5 <SKIPPED>")
            #files = ['AthenaMP_*', 'fifo_*', 'TokenExtractorChannel*', 'zmq_EventService*', 'asetup*', 'tmp*.pkl']
            #for f in files:
            #    Cleaner.purgeFiles(self.path, f, limit=48*3600)

            tolog("Executing PanDA Pilot dir clean-up, stage 5/5")
            JS = JobState()

            # grab all job state files in all work directories
            job_state_files = glob(self.path +
                                   "/Panda_Pilot_*/jobState-*.pickle")
            number_of_files = len(job_state_files)
            file_number = 0
            max_cleanups = 30
            tolog("Number of found job state files: %d" % (number_of_files))
            if job_state_files:
                # loop over all found job state files
                for file_path in job_state_files:
                    file_number += 1
                    if file_number > max_cleanups:
                        tolog(
                            "Maximum number of job recoveries exceeded for this pilot: %d"
                            % (max_cleanups))
                        break
                    tolog("Processing job state file %d/%d: %s" %
                          (file_number, number_of_files, file_path))
                    current_time = int(time.time())

                    # when was file last modified?
                    try:
                        file_modification_time = os.path.getmtime(file_path)
                    except:
                        # skip this file since it was not possible to read the modification time
                        pass
                    else:
                        # was the job state file updated longer than the time limit? (convert to seconds)
                        mod_time = current_time - file_modification_time
                        if mod_time > self.limit * 3600:
                            tolog(
                                "File was last modified %d seconds ago (proceed)"
                                % (mod_time))
                            cmd = "whoami; ls -lF %s; ls -lF %s" % (
                                file_path, os.path.dirname(file_path))
                            tolog("Executing command: %s" % (cmd))
                            ec, rs = commands.getstatusoutput(cmd)
                            if ec == 0:
                                tolog("%s" % (rs))
                            else:
                                tolog("!!WARNING!!2999!! %d, %s" % (ec, rs))

                            # open the job state file
                            if JS.get(file_path):
                                # decode the job state info
                                _job, _site, _node, _recoveryAttempt = JS.decode(
                                )

                                # add member if it doesn't exist (new Job version)
                                try:
                                    _tmp = _job.prodSourceLabel
                                except:
                                    _job.prodSourceLabel = ''

                                if _job and _site and _node:
                                    # query the job state file for job information
                                    if _job.result[
                                            0] == 'running' or _job.result[
                                                0] == 'starting' or (
                                                    _job.result[0] == 'holding'
                                                    and
                                                    mod_time > 7 * 24 * 3600):
                                        if _job.result[0] == 'holding':
                                            tolog(
                                                "Job %s was found in %s state but has not been modified for a long time - will be cleaned up"
                                                % (_job.jobId, _job.result[0]))
                                        else:
                                            tolog(
                                                "Job %s was found in %s state - will be cleaned up"
                                                % (_job.jobId, _job.result[0]))
                                        tolog("Erasing directory: %s" %
                                              (_site.workdir))
                                        cmd = "rm -rf %s" % (_site.workdir)
                                        try:
                                            ec, rs = commands.getstatusoutput(
                                                cmd)
                                        except:
                                            tolog(
                                                "!!WARNING!!5500!! Could not erase lost job workdir: %d, %s"
                                                % (ec, rs))
                                            status = False
                                            break
                                        else:
                                            tolog("Lost job workdir removed")
                                    else:
                                        tolog("Job found in state: %s" %
                                              (_job.result[0]))
                        else:
                            tolog(
                                "File was last modified %d seconds ago (skip)"
                                % (mod_time))
            else:
                tolog("No job state files were found, aborting clean-up")
        else:
            tolog("Clean-up turned off")
            status = False

        return status

    def purgeEmptyDirs(path):
        """ locate and remove empty lingering dirs """

        all_dirs = glob("%s/Panda_Pilot_*" % (path))
        max_dirs = 50
        purged_nr = 0
        dir_nr = 0

        for _dir in all_dirs:
            if dir_nr >= max_dirs:
                break
            # when was the dir last modified?
            current_time = int(time.time())
            try:
                file_modification_time = os.path.getmtime(_dir)
            except:
                # skip this dir since it was not possible to read the modification time
                pass
            else:
                mod_time = current_time - file_modification_time
                if mod_time > 2 * 3600:
                    try:
                        ls = listdir(_dir)
                    except Exception, e:
                        tolog("!!WARNING!!2999!! Exception caught: %s" %
                              str(e))
                    else:
                        if len(ls) == 0 or len(ls) == 1:
                            if len(ls) == 0:
                                tolog(
                                    "Found empty dir: %s (last modified %d s ago, will now purge it)"
                                    % (_dir, mod_time))
                            else:
                                tolog(
                                    "Found empty dir: %s (last modified %d s ago, will now purge it, 1 sub dir: %s)"
                                    % (_dir, mod_time, ls[0]))

                            ec, rs = commands.getstatusoutput("rm -rf %s" %
                                                              (_dir))
                            if ec != 0:
                                tolog("Failed to remove dir: %d, %s (belonging to user %d, pilot is run by user %d)" %\
                                      (ec, rs, os.stat(_dir)[4], os.getuid()))
                            else:
                                purged_nr += 1
            dir_nr += 1
        tolog("Purged %d empty directories" % (purged_nr))
Ejemplo n.º 36
0
    def cleanup(self):
        """ execute the clean-up """

        status = True
        number_of_cleanups = 0

        if self.clean:
            tolog("Executing empty dirs clean-up, stage 1/5")
            Cleaner.purgeEmptyDirs(self.path)

            tolog("Executing work dir clean-up, stage 2/5")
            Cleaner.purgeWorkDirs(self.path)

            tolog("Executing maxed-out dirs clean-up, stage 3/5")
            Cleaner.purgeMaxedoutDirs(self.path)

            tolog("Executing AthenaMP clean-up, stage 4/5 <SKIPPED>")
            #files = ['AthenaMP_*', 'fifo_*', 'TokenExtractorChannel*', 'zmq_EventService*', 'asetup*', 'tmp*.pkl']
            #for f in files:
            #    Cleaner.purgeFiles(self.path, f, limit=48*3600)

            tolog("Executing PanDA Pilot dir clean-up, stage 5/5")
            JS = JobState()

            # grab all job state files in all work directories
            job_state_files = glob(self.path +
                                   "/Panda_Pilot_*/jobState-*.pickle")
            number_of_files = len(job_state_files)
            file_number = 0
            max_cleanups = 30
            tolog("Number of found job state files: %d" % (number_of_files))
            if job_state_files:
                # loop over all found job state files
                for file_path in job_state_files:
                    file_number += 1
                    if file_number > max_cleanups:
                        tolog(
                            "Maximum number of job recoveries exceeded for this pilot: %d"
                            % (max_cleanups))
                        break
                    tolog("Processing job state file %d/%d: %s" %
                          (file_number, number_of_files, file_path))
                    current_time = int(time.time())

                    # when was file last modified?
                    try:
                        file_modification_time = os.path.getmtime(file_path)
                    except:
                        # skip this file since it was not possible to read the modification time
                        pass
                    else:
                        # was the job state file last updated longer ago than the time limit? (limit converted to seconds)
                        mod_time = current_time - file_modification_time
                        if mod_time > self.limit * 3600:
                            tolog(
                                "File was last modified %d seconds ago (proceed)"
                                % (mod_time))
                            cmd = "whoami; ls -lF %s; ls -lF %s" % (
                                file_path, os.path.dirname(file_path))
                            tolog("Executing command: %s" % (cmd))
                            ec, rs = commands.getstatusoutput(cmd)
                            if ec == 0:
                                tolog("%s" % (rs))
                            else:
                                tolog("!!WARNING!!2999!! %d, %s" % (ec, rs))

                            # open the job state file
                            if JS.get(file_path):
                                # decode the job state info
                                _job, _site, _node, _recoveryAttempt = JS.decode(
                                )

                                # add member if it doesn't exist (new Job version)
                                try:
                                    _tmp = _job.prodSourceLabel
                                except:
                                    _job.prodSourceLabel = ''

                                if _job and _site and _node:
                                    # query the job state file for job information
                                    if _job.result[
                                            0] == 'running' or _job.result[
                                                0] == 'starting' or (
                                                    _job.result[0] == 'holding'
                                                    and
                                                    mod_time > 7 * 24 * 3600):
                                        if _job.result[0] == 'holding':
                                            tolog(
                                                "Job %s was found in %s state but has not been modified for a long time - will be cleaned up"
                                                % (_job.jobId, _job.result[0]))
                                        else:
                                            tolog(
                                                "Job %s was found in %s state - will be cleaned up"
                                                % (_job.jobId, _job.result[0]))
                                        tolog("Erasing directory: %s" %
                                              (_site.workdir))
                                        cmd = "rm -rf %s" % (_site.workdir)
                                        try:
                                            ec, rs = commands.getstatusoutput(
                                                cmd)
                                        except Exception, e:
                                            tolog(
                                                "!!WARNING!!5500!! Could not erase lost job workdir: %s"
                                                % (e))
                                            status = False
                                            break
                                        else:
                                            tolog("Lost job workdir removed")
                                    else:
                                        tolog("Job found in state: %s" %
                                              (_job.result[0]))
                        else:
                            tolog(
                                "File was last modified %d seconds ago (skip)"
                                % (mod_time))
            else:
                tolog("No job state files were found, aborting clean-up")
        else:
            tolog("Clean-up turned off")
            status = False

        return status
Ejemplo n.º 37
0
    def __init__(self, limit=12, path="/tmp", uflag=None):
        """ Default init with verification """

        self.clean = True
        self.uflag = None
        # verify the clean-up limit
        _type = str(limit.__class__)
        if limit and _type.find('int') == -1:
            tolog("Trying to convert limit from type %s to int" % (_type))
            try:
                limit = int(limit)
            except:
                tolog("Failed to convert, reset to default")
                limit = 12
        if limit == 0:
            tolog("Clean-up limit set to zero (no clean-up will be done)")
            self.clean = False
        elif limit < 0 or not limit:
            limit = 12
            tolog(
                "!!WARNING!!5500!! Clean-up limit out of bounds, reset to default: %d"
                % (limit))

        self.limit = limit
        tolog("Cleaner initialized with clean-up limit: %d hours" %
              (self.limit))

        # verify the clean-up path and set the uflag if necessary
        if self.clean:
            if not path:
                path = "/tmp"
                tolog("Requested path reset to default: %s" % (path))
            if os.path.exists(path):
                self.path = path
                tolog(
                    "Cleaner will scan for lost directories in verified path: %s"
                    % (self.path))

                if uflag:
                    self.uflag = uflag
            else:
                tolog(
                    "!!WARNING!!5500!! No such directory: %s (clean-up not possible)"
                    % (path))
                self.path = None
                self.clean = False
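A minimal usage sketch for the Cleaner shown above; the argument values simply mirror the defaults and are illustrative:

# illustrative usage, assuming Cleaner is importable from the pilot
cleaner = Cleaner(limit=12, path="/tmp", uflag=None)
if cleaner.clean:
    status = cleaner.cleanup()
    tolog("Clean-up finished with status: %s" % (status))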
Ejemplo n.º 38
0
    def stageOutFile(self, source, destination, token=None, outputDir=None):
        """Stage out the file. Should be implementated by different site mover"""
        statusRet = 0
        outputRet = {}
        outputRet["errorLog"] = None
        outputRet["report"] = {}
        outputRet["report"]["clientState"] = None

        # determine which timeout option to use
        timeout_option = "-t %d" % (self.timeout)

        #mkdir
        _cmd_str = '%s gfal-mkdir --verbose %s -p %s' % (self._setup, timeout_option, os.path.dirname(destination))
        self.log("Executing command: %s" % (_cmd_str))
        status, output = commands.getstatusoutput(_cmd_str)
        self.log("status: %s, output: %s" % (status, output.replace("\n"," ")))
        if status != 0:
            outputRet["errorLog"] = output
            outputRet["report"]["clientState"] = "ERR_MKDIR"
            return PilotErrors.ERR_MKDIR, outputRet

        # cleanup the SURL if necessary (remove port and srm substring)
        if token:
            # Special case for GROUPDISK (do not remove dst: bit before this stage, needed in several places)
            if "dst:" in token:
                token = token[len('dst:'):]
                tolog("Dropped dst: part of space token descriptor; token=%s" % (token))
                if 'DATADISK' in token:
                    token = "ATLASDATADISK"
                else:
                    token = "ATLASGROUPDISK"
                tolog("Space token descriptor reset to: %s" % (token))

            _cmd_str = '%s gfal-copy --verbose %s -D "SRM PLUGIN:TURL_PROTOCOLS=gsiftp" -S %s file:%s %s' % (self._setup, timeout_option, token, source, destination)
        else:
            # surl is the same as putfile
            _cmd_str = '%s gfal-copy --verbose %s -D "SRM PLUGIN:TURL_PROTOCOLS=gsiftp" file:%s %s' % (self._setup, timeout_option, source, destination)

        ec = -1
        t0 = os.times()
        o = '(not defined)'
        if outputDir and outputDir.endswith("PilotMVOutputDir"):
            timeStart = time()
            outputFile = os.path.join(outputDir, os.path.basename(source))
            mvCmd = "cp -f %s %s" % (source, outputFile)
            tolog("Executing command: %s" % (mvCmd))
            lstatus, loutput = commands.getstatusoutput(mvCmd)
            if lstatus != 0:
                ec = lstatus
                o = loutput
            else:
                outputFileCmd = outputFile + ".gfalcmd"
                handle = open(outputFileCmd, 'w')
                handle.write(_cmd_str.replace(source, outputFile))
                handle.close()
                tolog("Write command %s to %s" % (_cmd_str.replace(source, outputFile), outputFileCmd))
                tolog("Waiting remote to finish transfer")
                o = "Remote timeout to transfer out file"
                while (time() - timeStart) < self.timeout:
                    sleep(5)
                    if os.path.exists(outputFile + ".gfalcmdfinished"):
                        ec = 0
                        o = "Remote finished transfer"
                        tolog(o)
                        os.remove(outputFile + ".gfalcmdfinished")
                        break
                    if os.path.exists(outputFile + ".gfalcmdfailed"):
                        ec = 0
                        o = "Remote finished transfer"
                        tolog(o)
                        os.remove(outputFile + ".gfalcmdfailed")
                        break
        else:
            tolog("Executing command: %s" % (_cmd_str))
            outputRet["report"]['relativeStart'] = time()
            outputRet["report"]['transferStart'] =  time()
            try:
                ec, o = commands.getstatusoutput(_cmd_str)
            except Exception, e:
                tolog("!!WARNING!!2999!! gfal-copy threw an exception: %s" % (o))
                o = str(e)
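For readability, a sketch of the gfal-copy command the space-token branch above composes; the setup string, token and paths are invented example values:

# illustrative values only
_setup = "source /path/to/emi_setup.sh;"
timeout_option = "-t 3600"
token = "ATLASDATADISK"
source = "/scratch/PandaJob_123/EVNT.pool.root"
destination = "srm://srm.example.org/atlasdatadisk/rucio/mc15/EVNT.pool.root"

_cmd_str = '%s gfal-copy --verbose %s -D "SRM PLUGIN:TURL_PROTOCOLS=gsiftp" -S %s file:%s %s' % \
           (_setup, timeout_option, token, source, destination)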
Ejemplo n.º 39
0
                        tolog(o)
                        os.remove(outputFile + ".gfalcmdfailed")
                        break
        else:
            tolog("Executing command: %s" % (_cmd_str))
            outputRet["report"]['relativeStart'] = time()
            outputRet["report"]['transferStart'] =  time()
            try:
                ec, o = commands.getstatusoutput(_cmd_str)
            except Exception, e:
                tolog("!!WARNING!!2999!! gfal-copy threw an exception: %s" % (o))
                o = str(e)
        outputRet["report"]['validateStart'] = time()
        t1 = os.times()
        t = t1[4] - t0[4]
        tolog("Command finished after %f s" % (t))
        tolog("ec = %d, output = %s" % (ec, o.replace("\n"," ")))

        if ec != 0:
            tolog("!!WARNING!!2990!! Command failed: %s" % (_cmd_str))
            #check_syserr(ec, o)
            tolog('!!WARNING!!2990!! Stage Out failed: Status=%d Output=%s' % (ec, str(o.replace("\n"," "))))

            status, output = self.errorToReport(o, t, source, stageMethod="stageOut")
            if status == PilotErrors.ERR_FILEEXIST:
                return status, output

            # check if file was partially transferred, if so, remove it
            _ec, removeOutput = self.removeRemoteFile(destination)
            if not _ec :
                self.log("Failed to remove file ") # i.e. do not retry stage-out
Ejemplo n.º 40
0
    def errorToReport(self, errorOutput, timeUsed, fileName, stageMethod='stageIN'):
        status = 0
        outputRet = {}
        outputRet["errorLog"] = None
        outputRet["report"] = {}
        outputRet["report"]["clientState"] = None

        if "File exists" in errorOutput or "SRM_FILE_BUSY" in errorOutput:
            pilotErrorDiag = "File already exist in the destination."
            tolog("!!WARNING!!2990!! %s" % (pilotErrorDiag))
            #self.prepareReport('FILE_EXIST', report)
            outputRet["report"]["clientState"] = 'FILE_EXIST'
            outputRet["errorLog"] = pilotErrorDiag
            return PilotErrors.ERR_FILEEXIST, outputRet
        elif "Could not establish context" in errorOutput:
            pilotErrorDiag = "Could not establish context: Proxy / VO extension of proxy has probably expired"
            tolog("!!WARNING!!2990!! %s" % (pilotErrorDiag))
            #self.prepareReport('CONTEXT_FAIL', report)
            outputRet["report"]["clientState"] = 'CONTEXT_FAIL'
            outputRet["errorLog"] = pilotErrorDiag
            return PilotErrors.ERR_NOPROXY, outputRet
        elif "globus_xio:" in errorOutput:
            pilotErrorDiag = "Globus system error: %s" % (errorOutput)
            self.log("Globus system error encountered")
            #self.prepareReport('GLOBUS_FAIL', report)
            outputRet["report"]["clientState"] = 'GLOBUS_FAIL'
            outputRet["errorLog"] = pilotErrorDiag
            return PilotErrors.ERR_GETGLOBUSSYSERR, outputRet
        elif "No space left on device" in errorOutput:
            pilotErrorDiag = "No available space left on local disk: %s" % (errorOutput)
            tolog("No available space left on local disk")
            #self.prepareReport('NO_SPACE', report)
            outputRet["report"]["clientState"] = 'NO_SPACE'
            outputRet["errorLog"] = pilotErrorDiag
            return PilotErrors.ERR_NOLOCALSPACE, outputRet
        elif "No such file or directory" in errorOutput:
            if "DBRelease" in fileName:
                pilotErrorDiag = "Missing DBRelease file: %s" % (fileName)
                tolog("!!WARNING!!2990!! %s" % (pilotErrorDiag))
                #self.prepareReport('NO_DBREL', report)
                outputRet["report"]["clientState"] = 'NO_DBREL'
                outputRet["errorLog"] = pilotErrorDiag
                return PilotErrors.ERR_MISSDBREL, outputRet
            else:
                pilotErrorDiag = "No such file or directory: %s" % (fileName)
                tolog("!!WARNING!!2990!! %s" % (pilotErrorDiag))
                #self.prepareReport('NO_FILE_DIR', report)
                outputRet["report"]["clientState"] = 'NO_FILE'
                outputRet["errorLog"] = pilotErrorDiag
                return PilotErrors.ERR_NOSUCHFILE, outputRet
        else:
            if timeUsed >= self.timeout:
                pilotErrorDiag = "Copy command self timed out after %d s" % (timeUsed)
                tolog("!!WARNING!!2990!! %s" % (pilotErrorDiag))
                if stageMethod == "stageIN":
                    #self.prepareReport('GET_TIMEOUT', report)
                    outputRet["report"]["clientState"] = 'GET_TIMEOUT'
                    outputRet["errorLog"] = pilotErrorDiag
                    return PilotErrors.ERR_GETTIMEOUT, outputRet
                else:
                    #self.prepareReport('CP_TIMEOUT', report)
                    outputRet["report"]["clientState"] = 'CP_TIMEOUT'
                    outputRet["errorLog"] = pilotErrorDiag
                    return PilotErrors.ERR_PUTTIMEOUT, outputRet
            else:
                if len(errorOutput) == 0:
                    pilotErrorDiag = "Copy command returned error code %d but no output" % (s)
                else:
                    pilotErrorDiag = errorOutput
                #self.prepareReport('COPY_ERROR', report)
                outputRet["report"]["clientState"] = 'COPY_ERROR'
                outputRet["errorLog"] = pilotErrorDiag
                if stageMethod == "stageIN":
                    return PilotErrors.ERR_STAGEINFAILED, outputRet
                else:
                    return PilotErrors.ERR_STAGEOUTFAILED, outputRet
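A sketch of how a caller could interpret the (status, outputRet) pair returned above; the error string is an invented example:

# illustrative call, e.g. after a failed gfal-copy in stageOutFile
status, output = self.errorToReport("No space left on device", t, source, stageMethod="stageOut")
if status != 0:
    tolog("clientState=%s, errorLog=%s" % (output["report"]["clientState"], output["errorLog"]))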
Ejemplo n.º 41
0
import os
import time

from pUtil import tolog, convert, getSiteInformation, readpar

def openFile(filename, mode):
    """ Open and return a file pointer for the given mode """
    # Note: caller needs to close the file

    f = None
    if os.path.exists(filename):
        try:
            f = open(filename, mode)
        except IOError, e:
            tolog("!!WARNING!!2997!! Caught exception: %s" % (e))
    else:
        tolog("!!WARNING!!2998!! File does not exist: %s" % (filename))

    return f

def getJSONDictionary(filename):
    """ Read a dictionary with unicode to utf-8 conversion """

    dictionary = None
    from json import load
    f = openFile(filename, 'r')
    if f:
        try:
            dictionary = load(f)
        except Exception, e:
            tolog("!!WARNING!!2222!! Failed to load json dictionary: %s" % (e))
        else:
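A short usage sketch for the two helpers above; the file names are only examples, and the caller is responsible for closing the file returned by openFile:

# illustrative usage
dictionary = getJSONDictionary("jobReport.json")
if dictionary:
    tolog("Loaded jobReport with keys: %s" % (dictionary.keys()))

f = openFile("pilotlog.txt", "r")
if f:
    f.close()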
Ejemplo n.º 42
0
    def log(self, errorLog):
        tolog(errorLog)
Ejemplo n.º 43
0
    def extractAppdir(self):
        """ Called by pilot.py, runMain method """
        tolog("CMSExperiment - extractAppdir - nothing to do")

        return 0, ""
Ejemplo n.º 44
0
def extractOutputFilesFromJSON(workDir, allowNoOutput):
    """ In case the trf has produced additional output files (spill-over), extract all output files from the jobReport """
    # Note: ignore files with nentries = 0

    output_files = []
    guids = []
    tolog("Extracting output files from jobReport")

    jobReport_dictionary = getJobReport(workDir)
    if jobReport_dictionary != {}:

        if jobReport_dictionary.has_key('files'):
            file_dictionary = jobReport_dictionary['files']
            if file_dictionary.has_key('output'):
                output_file_list = file_dictionary['output']
                for f_dictionary in output_file_list:
                    if f_dictionary.has_key('subFiles'):
                        subFiles_list = f_dictionary['subFiles']
                        for f_names_dictionary in subFiles_list:
                            if f_names_dictionary.has_key('name') and f_names_dictionary.has_key('nentries'):
                                # Only add the file if nentries > 0
                                if type(f_names_dictionary['nentries']) == int and f_names_dictionary['nentries'] > 0:
                                    output_files.append(f_names_dictionary['name'])

                                    # Also get the file guid
                                    if f_names_dictionary.has_key('file_guid'):
                                        guids.append(f_names_dictionary['file_guid'])
                                    else:
                                        tolog("!!WARNING!!1212!! Did not find any guid for this file: %s (will be generated)" % (f_names_dictionary['name']))
                                        guids.append(None)
                                else:
                                    # Only ignore the file if it is allowed to be ignored
                                    if not type(f_names_dictionary['nentries']) == int:
                                        tolog("!!WARNING!!4542!! nentries is not a number: %s" % str(f_names_dictionary['nentries']))

                                    # Special handling for origName._NNN
                                    # origName._NNN are unmerged files dynamically produced by AthenaMP. Job definition doesn't
                                    # explicitly specify those names but only the base names, thus allowNoOutput contains only base names
                                    # in this case. We want to ignore origName._NNN when allowNoOutput=['origName']
                                    from re import compile
                                    allowNoOutputEx = [compile(s + r'\.?_\d+$') for s in allowNoOutput]
                                    if f_names_dictionary['name'] in allowNoOutput or any(patt.match(f_names_dictionary['name']) for patt in allowNoOutputEx):
                                        tolog("Ignoring file %s since nentries=%s" % (f_names_dictionary['name'], str(f_names_dictionary['nentries'])))
                                    else:
                                        tolog("Will not ignore empty file %s since file is not in allowNoOutput list" % (f_names_dictionary['name']))
                                        output_files.append(f_names_dictionary['name'])

                                        # Also get the file guid
                                        if f_names_dictionary.has_key('file_guid'):
                                            guids.append(f_names_dictionary['file_guid'])
                                        else:
                                            tolog("!!WARNING!!1212!! Did not find any guid for this file: %s (will be generated)" % (f_names_dictionary['name']))
                                            guids.append(None)

                            else:
                                tolog("No such key: name/nentries")
                    else:
                        tolog("No such key: subFiles")
            else:
                tolog("No such key: output")
        else:
            tolog("No such key: files")

        if len(output_files) == 0:
            tolog("No output files found in jobReport")
        else:
            tolog("Output files found in jobReport: %s" % (output_files))

    return output_files, guids
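The keys accessed above imply roughly the following jobReport layout; this is a minimal assumed shape based only on those keys, not the full jobReport schema:

# minimal assumed shape (illustrative values)
jobReport_dictionary = {
    'files': {
        'output': [
            {'subFiles': [
                {'name': 'AOD.pool.root._000001', 'nentries': 1000, 'file_guid': 'EXAMPLE-GUID-1'},
                {'name': 'AOD.pool.root._000002', 'nentries': 0},  # skipped unless matched by allowNoOutput
            ]}
        ]
    }
}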
Ejemplo n.º 45
0
    def verifySwbase(self, appdir):
        """ Called by pilot.py, check needed for handleQueuedata method """
        tolog("CMSExperiment - verifySwbase - nothing to do")

        return 0
Ejemplo n.º 46
0
    def interpretPayloadStdout(self, job, res, getstatusoutput_was_interrupted,
                               current_job_number, runCommandList,
                               failureCode):
        """ payload error handling """

        # NOTE: Move away ATLAS specific info in this method, e.g. vmPeak stuff

        error = PilotErrors()
        #Mancinelli: moved it in experiment class method handleTrfExitcode
        #transExitCode = res[0]%255
        tolog("Mancinellidebug: res = %s res[0] = %s" % (res, res[0]))

        # Get the proper stdout filename
        number_of_jobs = len(runCommandList)
        filename = getStdoutFilename(job.workdir, job.stdout,
                                     current_job_number, number_of_jobs)

        # Try to identify out of memory errors in the stderr
        out_of_memory = self.isOutOfMemory(job=job,
                                           number_of_jobs=number_of_jobs)
        failed = out_of_memory  # failed boolean used below

        # A killed job can have empty output but still transExitCode == 0
        no_payload_output = False
        installation_error = False
        if getstatusoutput_was_interrupted:
            if os.path.exists(filename):
                if os.path.getsize(filename) > 0:
                    tolog(
                        "Payload produced stdout but was interrupted (getstatusoutput threw an exception)"
                    )
                else:
                    no_payload_output = True
                failed = True
            else:
                failed = True
                no_payload_output = True
        elif len(
                res[1]
        ) < 20:  # protect the following comparison against massive outputs
            if res[1] == 'Undefined':
                failed = True
                no_payload_output = True
        elif failureCode:
            failed = True
        else:
            # check for installation error
            res_tmp = res[1][:1024]
            if res_tmp[
                    0:
                    3] == "sh:" and 'setup.sh' in res_tmp and 'No such file or directory' in res_tmp:
                failed = True
                installation_error = True

        if res[0] or failed:
            #Mancinelli: all this common part with CMS?
            if failureCode:
                job.pilotErrorDiag = "Payload failed: Interrupt failure code: %d" % (
                    failureCode)
                # (do not set pilot error code)
            elif getstatusoutput_was_interrupted:
                raise Exception, "Job execution was interrupted (see stderr)"
            elif out_of_memory:
                job.pilotErrorDiag = "Payload ran out of memory"
                job.result[2] = error.ERR_ATHENAOUTOFMEMORY
            elif no_payload_output:
                job.pilotErrorDiag = "Payload failed: No output"
                job.result[2] = error.ERR_NOPAYLOADOUTPUT
            elif installation_error:
                job.pilotErrorDiag = "Payload failed: Missing installation"
                job.result[2] = error.ERR_MISSINGINSTALLATION
            elif res[0]:
                #Mancinelli: calling for experiment class method to manage transformation exit code
                job = self.handleTrfExitcode(job, res, error, filename)
            else:
                job.pilotErrorDiag = "Payload failed due to unknown reason (check payload stdout)"
                job.result[2] = error.ERR_UNKNOWN
            tolog("!!FAILED!!3000!! %s" % (job.pilotErrorDiag))

        # handle non-zero failed job return code but do not set pilot error codes to all payload errors
        """
        if transExitCode or failed:
            if failureCode:
                job.pilotErrorDiag = "Payload failed: Interrupt failure code: %d" % (failureCode)
                # (do not set pilot error code)
            elif getstatusoutput_was_interrupted:
                raise Exception, "Job execution was interrupted (see stderr)"
            elif out_of_memory:
                job.pilotErrorDiag = "Payload ran out of memory"
                job.result[2] = error.ERR_ATHENAOUTOFMEMORY
            elif no_payload_output:
                job.pilotErrorDiag = "Payload failed: No output"
                job.result[2] = error.ERR_NOPAYLOADOUTPUT
            elif installation_error:
                job.pilotErrorDiag = "Payload failed: Missing installation"
                job.result[2] = error.ERR_MISSINGINSTALLATION
            elif transExitCode:
                # Handle PandaMover errors
                if transExitCode == 176:
                    job.pilotErrorDiag = "PandaMover staging error: File is not cached"
                    job.result[2] = error.ERR_PANDAMOVERFILENOTCACHED
                elif transExitCode == 86:
                    job.pilotErrorDiag = "PandaMover transfer failure"
                    job.result[2] = error.ERR_PANDAMOVERTRANSFER
                else:
                    # check for specific errors in athena stdout
                    if os.path.exists(filename):
                        e1 = "prepare 5 database is locked"
                        e2 = "Error SQLiteStatement"
                        _out = commands.getoutput('grep "%s" %s | grep "%s"' % (e1, filename, e2))
                        if 'sqlite' in _out:
                            job.pilotErrorDiag = "NFS/SQLite locking problems: %s" % (_out)
                            job.result[2] = error.ERR_NFSSQLITE
                        else:
                            job.pilotErrorDiag = "Job failed: Non-zero failed job return code: %d" % (transExitCode)
                            # (do not set a pilot error code)
                    else:
                        job.pilotErrorDiag = "Job failed: Non-zero failed job return code: %d (%s does not exist)" % (transExitCode, filename)
                        # (do not set a pilot error code)
            else:
                job.pilotErrorDiag = "Payload failed due to unknown reason (check payload stdout)"
                job.result[2] = error.ERR_UNKNOWN
            tolog("!!FAILED!!3000!! %s" % (job.pilotErrorDiag))

        # set the trf diag error
        if res[2] != "":
            tolog("TRF diagnostics: %s" % (res[2]))
            job.exeErrorDiag = res[2]

        job.result[1] = transExitCode
        """
        return job
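For reference, the res argument used above follows the (exit code, payload output, trf diagnostics) convention implied by the res[0], res[1] and res[2] accesses; an illustrative tuple:

# illustrative only
res = (1, "<tail of payload stdout>", "trf error diagnostics text")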
Ejemplo n.º 47
0
    def getCMSRunCommand(self, job, jobSite, trfName):

        from RunJobUtilities import updateCopysetups

        ec = 0
        pilotErrorDiag = ""
        run_command = ""

        # get relevant file transfer info
        dInfo, useCopyTool, useDirectAccess, useFileStager, oldPrefix, newPrefix, copysetup, usePFCTurl, lfcHost =\
               self.getFileTransferInfo(job.transferType, isBuildJob(job.outFiles))

        # extract the setup file from copysetup (and verify that it exists)
        _copysetup = self.getSetupFromCopysetup(copysetup)
        tolog("copysetup = %s" % _copysetup)
        if _copysetup != "" and os.path.exists(_copysetup):
            run_command = 'source %s; ' % (_copysetup)

        # add the user proxy
        if os.environ.has_key('X509_USER_PROXY'):
            run_command += 'export X509_USER_PROXY=%s; ' % os.environ[
                'X509_USER_PROXY']
        else:
            tolog(
                "Could not add user proxy to the run command (proxy does not exist)"
            )
        """
        strpars = job.jobPars
        cmdopt = shlex.split(strpars)
        parser = PassThroughOptionParser()
        parser.add_option('-a',\
                          dest='a',\
                          type='string')
        parser.add_option('-o',\
                          dest='o',\
                          type='string')
        parser.add_option('--inputFile',\
                          dest='inputFile',\
                          type='string')
        parser.add_option('--sourceURL',\
                          dest='sourceURL',\
                          type='string')
        parser.add_option('--jobNumber',\
                          dest='jobNumber',\
                          type='string')
        parser.add_option('--cmsswVersion',\
                          dest='cmsswVersion',\
                          type='string')
        parser.add_option('--scramArch',\
                          dest='scramArch',\
                          type='string')
        parser.add_option('--runAndLumis',\
                          dest='runAndLumis',\
                          type='string')
        (options,args) = parser.parse_args(cmdopt)

        paramsstring  = '-a %s '                % options.a
        paramsstring += '--sourceURL %s '       % options.sourceURL
        paramsstring += '--jobNumber=%s '       % options.jobNumber
        paramsstring += '--cmsswVersion=%s '    % options.cmsswVersion
        paramsstring += '--scramArch=%s '       % options.scramArch
        paramsstring += "--inputFile='%s' "     % options.inputFile
        paramsstring += "--runAndLumis='%s' "   % options.runAndLumis
        paramsstring += '-o "%s" '              % options.o

        tolog("paramsstring = %s" % paramsstring)
        """
        run_command += './%s %s' % (trfName, job.jobPars)

        return ec, pilotErrorDiag, run_command
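To illustrate, the composed run_command from the method above ends up looking roughly like the following; the setup path, proxy location, trf name and job parameters are all invented:

# illustrative composition only
trfName = "CMSRunAnalysis.sh"                        # example trf name
jobPars = "-a sandbox.tar.gz --jobNumber=1 -o '{}'"  # example job parameters
run_command = "source /path/to/copysetup.sh; "
run_command += "export X509_USER_PROXY=/tmp/x509up_u12345; "
run_command += "./%s %s" % (trfName, jobPars)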
Ejemplo n.º 48
0
    def checkSpecialEnvVars(self, sitename):
        """ Called by pilot.py """
        tolog("CMSExperiment - checkSpecialEnvVars - nothing to do")

        return 0
Ejemplo n.º 49
0
    def finishJob(self):
        try:
            self.__hpcManager.finishJob()
        except:
            tolog(sys.exc_info()[1])
            tolog(sys.exc_info()[2])

        # If payload leaves the input files, delete them explicitly
        if self.__job.inFiles:
            ec = pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)
        #if self.__output_es_files:
        #    ec = pUtil.removeFiles("/", self.__output_es_files)

        errorCode = PilotErrors.ERR_UNKNOWN
        if self.__job.attemptNr < 4:
            errorCode = PilotErrors.ERR_ESRECOVERABLE

        #check HPC job status
        #if self.__hpcStatus:
        #    self.failJob(0, 1220, self.__job, pilotErrorDiag="HPC job failed")

        if len(self.__eventRanges) == 0:
            tolog("Cannot get event ranges")
            self.failJob(0,
                         errorCode,
                         self.__job,
                         pilotErrorDiag="Cannot get event ranges")

        # check whether all event ranges are handled
        tolog("Total event ranges: %s" % len(self.__eventRanges))
        not_handled_events = self.__eventRanges.values().count('new')
        tolog("Not handled events: %s" % not_handled_events)
        done_events = self.__eventRanges.values().count('Done')
        tolog("Finished events: %s" % done_events)
        stagedOut_events = self.__eventRanges.values().count('stagedOut')
        tolog("stagedOut but not updated to panda server events: %s" %
              stagedOut_events)
        if done_events + stagedOut_events:
            errorCode = PilotErrors.ERR_ESRECOVERABLE
        if not_handled_events + stagedOut_events:
            tolog("Not all event ranges are handled. failed job")
            self.failJob(
                0,
                errorCode,
                self.__job,
                pilotErrorDiag="Not All events are handled(total:%s, left:%s)"
                % (len(self.__eventRanges),
                   not_handled_events + stagedOut_events))

        dsname, datasetDict = self.getDatasets()
        tolog("dsname = %s" % (dsname))
        tolog("datasetDict = %s" % (datasetDict))

        # Create the output file dictionary needed for generating the metadata
        ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(
            self.__job.outFiles,
            self.__job.logFile,
            self.__job.workdir,
            fullpath=True)
        if ec:
            # missing output file (only error code from prepareOutFiles)
            self.failJob(self.__job.result[1],
                         ec,
                         self.__job,
                         pilotErrorDiag=pilotErrorDiag)
        tolog("outsDict: %s" % str(outsDict))

        # Create metadata for all successfully staged-out output files (include the log file as well, even if it has not been created yet)
        ec, job, outputFileInfo = self.createFileMetadata(
            [], self.__job, outsDict, dsname, datasetDict,
            self.__jobSite.sitename)
        if ec:
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

        # Rename the metadata produced by the payload
        # if not pUtil.isBuildJob(outs):
        self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

        # Check the job report for any exit code that should replace the res_tuple[0]
        res0, exitAcronym, exitMsg = self.getTrfExitInfo(0, self.__job.workdir)
        res = (res0, exitMsg, exitMsg)

        # Payload error handling
        ed = ErrorDiagnosis()
        job = ed.interpretPayload(self.__job, res, False, 0,
                                  self.__runCommandList, self.getFailureCode())
        if job.result[1] != 0 or job.result[2] != 0:
            self.failJob(job.result[1],
                         job.result[2],
                         job,
                         pilotErrorDiag=job.pilotErrorDiag)
        self.__job = job

        job.jobState = "finished"
        job.setState([job.jobState, 0, 0])
        job.jobState = job.result
        rt = RunJobUtilities.updatePilotServer(job,
                                               self.getPilotServer(),
                                               self.getPilotPort(),
                                               final=True)

        tolog("Done")
        self.sysExit(self.__job)
Ejemplo n.º 50
0
    def getJobExecutionCommand(self, job, jobSite, pilot_initdir):
        """ Define and test the command(s) that will be used to execute the payload """

        # Input tuple: (method is called from RunJob*)
        #   job: Job object
        #   jobSite: Site object
        #   pilot_initdir: launch directory of pilot.py
        #
        # Return tuple:
        #   pilot_error_code, pilot_error_diagnostics, job_execution_command, special_setup_command, JEM, cmtconfig
        # where
        #   pilot_error_code       : self.__error.<PILOT ERROR CODE as defined in PilotErrors class> (value should be 0 for successful setup)
        #   pilot_error_diagnostics: any output from problematic command or explanatory error diagnostics
        #   job_execution_command  : command to execute payload, e.g. cmd = "source <path>/setup.sh; <path>/python trf.py [options]"
        #   special_setup_command  : any special setup command that can be insterted into job_execution_command and is sent to stage-in/out methods
        #   JEM                    : Job Execution Monitor activation state (default value "NO", meaning JEM is not to be used. See JEMstub.py)
        #   cmtconfig              : cmtconfig symbol from the job def or schedconfig, e.g. "x86_64-slc5-gcc43-opt"

        pilotErrorDiag = ""
        cmd = ""
        JEM = "NO"

        # Is it an analysis job or not?
        isCMSRunJob = self.isCMSRunJob(job.trf)
        tolog("isCMSRunJob = %s " % isCMSRunJob)

        # Command used to download trf
        wgetCommand = 'wget'

        # Get the cmtconfig value
        cmtconfig = getCmtconfig(job.cmtconfig)
        if cmtconfig != "":
            tolog("cmtconfig: %s" % (cmtconfig))

        # Set python executable
        ec, pilotErrorDiag, pybin = self.setPython()
        if ec == self.__error.ERR_MISSINGINSTALLATION:
            return ec, pilotErrorDiag, "", special_setup_cmd, JEM, cmtconfig

        # Define the job execution command
        if isCMSRunJob:
            # Try to download the analysis trf
            status, pilotErrorDiag, trfName = self.getAnalysisTrf(
                wgetCommand, job.trf, pilot_initdir)
            if status != 0:
                return status, pilotErrorDiag, "", special_setup_cmd, JEM, cmtconfig

            scramArchSetup = self.getScramArchSetupCommand(job)
            ec, pilotErrorDiag, cmdtrf = self.getCMSRunCommand(
                job, jobSite, trfName)
            cmd = "%s %s" % (scramArchSetup, cmdtrf)

        # Set special_setup_cmd if necessary
        special_setup_cmd = self.getSpecialSetupCommand()
        if special_setup_cmd != "":
            tolog("Special setup command: %s" % (special_setup_cmd))

        # Pipe stdout/err for payload to files
        cmd += " 1>%s 2>%s" % (job.stdout, job.stderr)
        tolog("\nCommand to run the job is: \n%s" % (cmd))

        return 0, pilotErrorDiag, cmd, special_setup_cmd, JEM, cmtconfig
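A sketch of how the calling RunJob code might unpack the documented return tuple; the variable names are illustrative:

# illustrative unpacking of the return tuple documented above
ec, pilotErrorDiag, cmd, special_setup_cmd, JEM, cmtconfig = \
    thisExperiment.getJobExecutionCommand(job, jobSite, pilot_initdir)
if ec != 0:
    tolog("!!WARNING!!2999!! Failed to build the payload command: %s" % (pilotErrorDiag))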
Ejemplo n.º 51
0
class RunJobHpcEvent(RunJob):

    # private data members
    __runjob = "RunJobHpcEvent"  # String defining the sub class
    __instance = None  # Boolean used by subclasses to become a Singleton

    #__error = PilotErrors()                     # PilotErrors object

    # Required methods

    def __init__(self):
        """ Default initialization """

        # e.g. self.__errorLabel = errorLabel
        self.__output_es_files = []
        self.__eventRanges = {}
        self.__failedStageOuts = []
        self._hpcManager = None

    def __new__(cls, *args, **kwargs):
        """ Override the __new__ method to make the class a singleton """

        if not cls.__instance:
            cls.__instance = super(RunJob, cls).__new__(cls, *args, **kwargs)

        return cls.__instance

    def getRunJob(self):
        """ Return a string with the experiment name """

        return self.__runjob

    def getRunJobFileName(self):
        """ Return the filename of the module """

        return super(RunJobHpcEvent, self).getRunJobFileName()

    # def argumentParser(self):  <-- see example in RunJob.py

    def allowLoopingJobKiller(self):
        """ Should the pilot search for looping jobs? """

        # The pilot has the ability to monitor the payload work directory. If there are no updated files within a certain
        # time limit, the pilot will consider the job as stuck (looping) and will kill it. The looping time limits are set
        # in environment.py (see e.g. loopingLimitDefaultProd)

        return False

    def setupHPCEvent(self):
        self.__jobSite = Site.Site()
        self.__jobSite.setSiteInfo(self.argumentParser())
        ## For HPC job, we don't need to reassign the workdir
        # reassign workdir for this job
        self.__jobSite.workdir = self.__jobSite.wntmpdir
        if not os.path.exists(self.__jobSite.workdir):
            os.makedirs(self.__jobSite.workdir)

        tolog("runJobHPCEvent.getPilotLogFilename=%s" %
              self.getPilotLogFilename())
        if self.getPilotLogFilename() != "":
            pUtil.setPilotlogFilename(self.getPilotLogFilename())

        # set node info
        self.__node = Node.Node()
        self.__node.setNodeName(os.uname()[1])
        self.__node.collectWNInfo(self.__jobSite.workdir)

        # redirect stderr
        #sys.stderr = open("%s/runJobHPCEvent.stderr" % (self.__jobSite.workdir), "w")

        tolog("Current job workdir is: %s" % os.getcwd())
        tolog("Site workdir is: %s" % self.__jobSite.workdir)

        # get the experiment object
        self.__thisExperiment = getExperiment(self.getExperiment())
        tolog("runEvent will serve experiment: %s" %
              (self.__thisExperiment.getExperiment()))

    def getHPCEventJobFromPanda(self):
        pass

    def getHPCEventJobFromEnv(self):
        tolog("getHPCEventJobFromEnv")
        try:
            # always use this filename as the new jobDef module name
            import newJobDef
            job = Job.Job()
            job.setJobDef(newJobDef.job)
            job.coreCount = 0
            job.workdir = self.__jobSite.workdir
            job.experiment = self.getExperiment()
            # figure out and set payload file names
            job.setPayloadName(self.__thisExperiment.getPayloadName(job))
            # reset the default job output file list which is anyway not correct
            job.outFiles = []
        except Exception, e:
            pilotErrorDiag = "Failed to process job info: %s" % str(e)
            tolog("!!WARNING!!3000!! %s" % (pilotErrorDiag))
            self.failJob(0,
                         PilotErrors.ERR_UNKNOWN,
                         job,
                         pilotErrorDiag=pilotErrorDiag)

        self.__job = job
        # prepare for the output file data directory
        # (will only be created for jobs that end up in a 'holding' state)
        self.__job.datadir = self.getParentWorkDir() + "/PandaJob_%s_data" % (
            job.jobId)

        # See if it's an analysis job or not
        trf = self.__job.trf
        self.__analysisJob = isAnalysisJob(trf.split(",")[0])

        # Setup starts here ................................................................................

        # Update the job state file
        self.__job.jobState = "starting"
        self.__job.setHpcStatus('init')

        # Send [especially] the process group back to the pilot
        self.__job.setState([self.__job.jobState, 0, 0])
        self.__job.jobState = self.__job.result
        rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(),
                                               self.getPilotPort())

        self.__JR = JobRecovery(pshttpurl='https://pandaserver.cern.ch',
                                pilot_initdir=self.__job.workdir)
        self.__JR.updateJobStateTest(self.__job,
                                     self.__jobSite,
                                     self.__node,
                                     mode="test")
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node,
                                    25443)

        # prepare the setup and get the run command list
        ec, runCommandList, job, multi_trf = self.setup(
            self.__job, self.__jobSite, self.__thisExperiment)
        if ec != 0:
            tolog("!!WARNING!!2999!! runJob setup failed: %s" %
                  (job.pilotErrorDiag))
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
        tolog("Setup has finished successfully")
        self.__job = job
        self.__runCommandList = runCommandList
        self.__multi_trf = multi_trf

        # job has been updated, display it again
        self.__job.displayJob()
        tolog("RunCommandList: %s" % self.__runCommandList)
        tolog("Multi_trf: %s" % self.__multi_trf)
Ejemplo n.º 52
0
    def getExpSpecificMetadata(self, job, workdir):
        """ Return metadata extracted from jobReport.json"""

        fwjrMetadata = ''
        fwjrFile = os.path.join(workdir, "jobReport.json")
        tolog("Looking for jobReport file")
        if os.path.exists(fwjrFile):
            tolog("Found jobReport: %s" % fwjrFile)
            try:
                f = open(fwjrFile, 'r')
                for line in f.readlines():
                    fwjrMetadata += line
            except Exception, e:
                tolog("Failed to open jobReport file: %s" % str(e))
        else:
            tolog("jobReport not found in %s " % fwjrFile)

        return fwjrMetadata

    def handleTrfExitcode(self, job, res, error, filename):
        transExitCode = res[0]
        #Mancinelli: TODO map CMS transformation error codes with error messages
        if transExitCode:
            # Handle PandaMover errors
            # Mancinelli: do we need this?
            if transExitCode == 176:
                job.pilotErrorDiag = "PandaMover staging error: File is not cached"
                job.result[2] = error.ERR_PANDAMOVERFILENOTCACHED
            elif transExitCode == 86:
                job.pilotErrorDiag = "PandaMover transfer failure"
                job.result[2] = error.ERR_PANDAMOVERTRANSFER
Ejemplo n.º 53
0
        if status == 0:
            try:
                #self.updateEventRange(eventRangeID)
                self.__eventRanges[eventRangeID] = 'stagedOut'
                tolog("Remove staged out output file: %s" % output)
                os.remove(output)
            except Exception, e:
                tolog(
                    "!!WARNING!!2233!! remove ouput file threw an exception: %s"
                    % (e))
                #self.__failedStageOuts.append(output_info)
            else:
                tolog("remove output file has returned")
        else:
            tolog(
                "!!WARNING!!1164!! Failed to upload file to objectstore: %d, %s"
                % (status, pilotErrorDiag))
            self.__failedStageOuts.append(output_info)

    def getDefaultResources(self):
        siteInfo = getSiteInformation(self.getExperiment())
        catchalls = siteInfo.readpar("catchall")
        values = {}
        for catchall in catchalls.split(","):
            if '=' in catchall:
                values[catchall.split('=')[0]] = catchall.split('=')[1]
        res = {}
        res['queue'] = values.get('queue', 'regular')
        res['mppwidth'] = values.get('mppwidth', 48)
        res['mppnppn'] = values.get('mppnppn', 1)
        res['walltime_m'] = values.get('walltime_m', 30)
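As an example of the catchall parsing above, a schedconfig catchall string of the form below (values invented) would produce the following dictionary, with missing keys falling back to the defaults passed to values.get():

# illustrative catchall string and parsed values
catchalls = "queue=premium,mppwidth=96,walltime_m=60"
values = {}
for catchall in catchalls.split(","):
    if '=' in catchall:
        values[catchall.split('=')[0]] = catchall.split('=')[1]
# values == {'queue': 'premium', 'mppwidth': '96', 'walltime_m': '60'}
# e.g. res['mppnppn'] would then take the default 1 via values.get('mppnppn', 1)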
Ejemplo n.º 54
0
    def runHPCEvent(self):
        tolog("runHPCEvent")
        self.__job.jobState = "running"
        self.__job.setState([self.__job.jobState, 0, 0])
        self.__job.pilotErrorDiag = None
        rt = RunJobUtilities.updatePilotServer(self.__job,
                                               self.getPilotServer(),
                                               self.getPilotPort())
        self.__JR.updateJobStateTest(self.__job,
                                     self.__jobSite,
                                     self.__node,
                                     mode="test")

        defRes = self.getDefaultResources()
        if defRes['copy_input_files'] == 'true':
            self.__copyInputFiles = True
        else:
            self.__copyInputFiles = False

        status, output, hpcJob = self.prepareHPCJob()
        if status == 0:
            tolog("HPC Job: %s " % hpcJob)
        else:
            tolog("failed to create the Tag file")
            self.failJob(0,
                         PilotErrors.ERR_UNKNOWN,
                         self.__job,
                         pilotErrorDiag=output)
            return

        self.__hpcStatus = None
        self.__hpcLog = None

        logFileName = None
        tolog("runJobHPCEvent.getPilotLogFilename=%s" %
              self.getPilotLogFilename())
        if self.getPilotLogFilename() != "":
            logFileName = self.getPilotLogFilename()
        hpcManager = HPCManager(globalWorkingDir=self.__job.workdir,
                                logFileName=logFileName,
                                poolFileCatalog=self.__poolFileCatalogTemp,
                                inputFiles=self.__inputFilesGlobal,
                                copyInputFiles=self.__copyInputFiles)

        self.__hpcManager = hpcManager
        self.HPCMode = "HPC_" + hpcManager.getMode(defRes)
        self.__job.setMode(self.HPCMode)
        self.__job.setHpcStatus('waitingResource')
        rt = RunJobUtilities.updatePilotServer(self.__job,
                                               self.getPilotServer(),
                                               self.getPilotPort())
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node,
                                    25443)

        hpcManager.getFreeResources(defRes)
        self.__job.coreCount = hpcManager.getCoreCount()
        self.__job.setHpcStatus('gettingEvents')
        rt = RunJobUtilities.updatePilotServer(self.__job,
                                               self.getPilotServer(),
                                               self.getPilotPort())
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node,
                                    25443)

        numRanges = hpcManager.getEventsNumber()
        tolog(
            "HPC Manager needs events: %s, max_events: %s; use the smallest one."
            % (numRanges, defRes['max_events']))
        if numRanges > int(defRes['max_events']):
            numRanges = int(defRes['max_events'])
        eventRanges = self.getEventRanges(numRanges=numRanges)
        #tolog("Event Ranges: %s " % eventRanges)
        if len(eventRanges) == 0:
            tolog("Get no Event ranges. return")
            return
        for eventRange in eventRanges:
            self.__eventRanges[eventRange['eventRangeID']] = 'new'

        # setup stage out
        self.setupStageOutHPCEvent()

        hpcManager.initJob(hpcJob)
        hpcManager.initEventRanges(eventRanges)

        hpcManager.submit()
        threadpool = ThreadPool(defRes['stageout_threads'])

        old_state = None
        time_start = time.time()
        while not hpcManager.isFinished():
            state = hpcManager.poll()
            self.__job.setHpcStatus(state)
            if old_state is None or old_state != state or time.time() > (
                    time_start + 60 * 10):
                old_state = state
                time_start = time.time()
                tolog("HPCManager Job stat: %s" % state)
                self.__JR.updateJobStateTest(self.__job,
                                             self.__jobSite,
                                             self.__node,
                                             mode="test")
                rt = RunJobUtilities.updatePilotServer(self.__job,
                                                       self.getPilotServer(),
                                                       self.getPilotPort())
                self.__JR.updatePandaServer(self.__job, self.__jobSite,
                                            self.__node, 25443)

            if state and state == 'Complete':
                break
            outputs = hpcManager.getOutputs()
            for output in outputs:
                #self.stageOutHPCEvent(output)
                threadpool.add_task(self.stageOutHPCEvent, output)

            time.sleep(30)
            self.updateHPCEventRanges()

        tolog("HPCManager Job Finished")
        self.__job.setHpcStatus('stagingOut')
        rt = RunJobUtilities.updatePilotServer(self.__job,
                                               self.getPilotServer(),
                                               self.getPilotPort())
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node,
                                    25443)

        outputs = hpcManager.getOutputs()
        for output in outputs:
            #self.stageOutHPCEvent(output)
            threadpool.add_task(self.stageOutHPCEvent, output)

        self.updateHPCEventRanges()
        threadpool.wait_completion()
        self.updateHPCEventRanges()

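        # retry any failed stage-outs, first with half as many threads, then with a single thread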
        if len(self.__failedStageOuts) > 0:
            tolog("HPC Stage out retry 1")
            half_stageout_threads = defRes['stageout_threads'] / 2
            if half_stageout_threads < 1:
                half_stageout_threads = 1
            threadpool = ThreadPool(half_stageout_threads)
            failedStageOuts = self.__failedStageOuts
            self.__failedStageOuts = []
            for failedStageOut in failedStageOuts:
                threadpool.add_task(self.stageOutHPCEvent, failedStageOut)
            threadpool.wait_completion()
            self.updateHPCEventRanges()

        if len(self.__failedStageOuts) > 0:
            tolog("HPC Stage out retry 2")
            threadpool = ThreadPool(1)
            failedStageOuts = self.__failedStageOuts
            self.__failedStageOuts = []
            for failedStageOut in failedStageOuts:
                threadpool.add_task(self.stageOutHPCEvent, failedStageOut)
            threadpool.wait_completion()
            self.updateHPCEventRanges()

        self.__job.setHpcStatus('finished')
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node,
                                    25443)
        self.__hpcStatus, self.__hpcLog = hpcManager.checkHPCJobLog()
        tolog("HPC job log status: %s, job log error: %s" %
              (self.__hpcStatus, self.__hpcLog))
Ejemplo n.º 55
0
    def setup(self, job, jobSite, thisExperiment):
        """ prepare the setup and get the run command list """

        # start setup time counter
        t0 = time.time()
        ec = 0

        # split up the job parameters to be able to loop over the tasks
        jobParameters = job.jobPars.split("\n")[0]
        jobTrf = job.trf.split("\n")[0]

        parser = optparse.OptionParser(
            description=' program to submit alpgen jobs like a pilot')
        parser.add_option('-p',
                          '--process',
                          dest='process',
                          help='Alpgen Process, i.e. zjet, wjet, wqq, etc.')
        parser.add_option(
            '-n',
            '--nevts',
            dest='nevts',
            help=
            'Number of weighted events requested in input file for weighted event generation',
            type='int')
        parser.add_option(
            '-g',
            '--group-id',
            dest='group_identifier',
            help=
            'User specified string that helps the user group jobs together.')
        parser.add_option('-e',
                          '--ecm',
                          dest='ecm',
                          help='Center of Mass Energy.')
        parser.add_option('-r',
                          '--run-number',
                          dest='run_number',
                          help='Run Number')
        parser.add_option(
            '-c',
            '--jobConfig',
            dest='jobConfig',
            help=
            'Job Options that will used from the Job Config tarball, i.e. MC12JobOptions/MC12.<Run Number>.<description>.py'
        )
        parser.add_option(
            '-j',
            '--evgenJobOpts',
            dest='evgenJobOpts',
            help='Job Config tarball, i.e. MC12JobOpts-XX-YY-ZZ.tar.gz')
        parser.add_option('',
                          '--dev',
                          dest='dev',
                          help='For development only.',
                          action='store_true',
                          default=False)
        parser.add_option(
            '-q',
            '--status-queue',
            dest='enable_status_queue',
            help=
            'Enable the setting of the message queue parameter in the ArgoJob, which means ARGO will not send message updates for this job to the queue with its job ID.',
            action='store_true',
            default=False)
        #parser.add_option('-a','--warmup-evts',dest='warmup_evts',help='For Warmup Step: Three numbers separated by commas giving the number of events per iteration, number of iterations, and final number of events to generate. Example: "10000,10,1000000"')
        parser.add_option(
            '-b',
            '--evtgen-evts',
            dest='evtgen_evts',
            help=
            'For Event Generation Step: The number of events to generate in the event generation step. The output of unweighted events tends to be lower, so request more than you want. For example, W+0jets gives you 70%, W+1jet gives you 16%, W+2jet gives you 5%, W+3jet gives you 1%, and so on.',
            type='int')
        parser.add_option('-o',
                          '--num-nodes',
                          dest='numnodes',
                          help='number of nodes to use on destination machine',
                          type='int')
        parser.add_option(
            '-u',
            '--ranks-per-node',
            dest='ranks_per_node',
            help='number of MPI ranks per node to use on destination machine',
            type='int')
        parser.add_option(
            '-t',
            '--wall-time',
            dest='walltime',
            help='The wall time to submit to the queue in minutes.',
            type='int')
        parser.add_option(
            '-s',
            '--site',
            dest='site',
            help='Balsam site name on which to run the event generation')
        parser.add_option(
            '-x',
            '--no-submit',
            dest='submit',
            help='do not submit the message to ARGO. For testing purposes.',
            action='store_false',
            default=True)
        parser.add_option(
            '',
            '--wmp-evts-itr',
            dest='wm_evts_per_itr',
            help='Warmup: Number of weighted events per iteration.')
        parser.add_option('',
                          '--wmp-nitr',
                          dest='wm_nitr',
                          help='Warmup: Number of iterations')
        parser.add_option('',
                          '--wmp-evts',
                          dest='wm_evts',
                          help='Warmup: Number of final events to produce.')

        try:
            options, args = parser.parse_args(shlex.split(jobParameters))
        except:
            ec = self.__error.ERR_SETUPFAILURE
            job.pilotErrorDiag = "Failure to parse job arguments"
            tolog("Failure to parse job arguments for ARGO job")
            return ec, job

        tolog("ARGO job will be launched with next parameters: %s" %
              jobParameters)

        self.process = options.process
        self.username = '******' % job.prodUserID[:120]  #os.environ['USER']
        self.group_identifier = options.group_identifier
        self.ecm = options.ecm
        self.run_number = options.run_number
        self.job_config = options.jobConfig
        self.evgen_job_opts = options.evgenJobOpts
        self.warmup_phase0_number_events = options.wm_evts_per_itr
        self.warmup_phase0_number_iterations = options.wm_nitr
        self.warmup_phase1_number_events = options.wm_evts
        self.evtgen_phase1_number_events = options.evtgen_evts
        self.evtgen_nodes = options.numnodes
        self.evtgen_processes_per_node = options.ranks_per_node
        self.evtgen_wall_minutes = options.walltime
        self.parallel_site = options.site

        self.dev = options.dev

        self.job_path = os.path.join(self.job_working_path, job.jobId)

        tolog("ARGO job path: %s" % self.job_path)

        self.argo_job = self.get_argo_job(job)

        if options.dev:
            job.serial_site = 'argo_cluster_dev'

        # verify that the multi-trf job is set up properly

        os.chdir(jobSite.workdir)
        tolog("Current job workdir is %s" % os.getcwd())

        job.timeSetup = int(time.time() - t0)
        tolog("Total setup time: %d s" % (job.timeSetup))

        return ec, job
Ejemplo n.º 56
0
    def prepareHPCJob(self):
        #print self.__runCommandList
        #print self.getParentWorkDir()
        #print self.__job.workdir
        # 1. input files
        inputFiles = []
        inputFilesGlobal = []
        for inputFile in self.__job.inFiles:
            #inputFiles.append(os.path.join(self.__job.workdir, inputFile))
            inputFilesGlobal.append(os.path.join(self.__job.workdir,
                                                 inputFile))
            inputFiles.append(os.path.join('HPCWORKINGDIR', inputFile))
        inputFileDict = dict(zip(self.__job.inFilesGuids, inputFilesGlobal))
        self.__inputFilesGlobal = inputFilesGlobal

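        # separate TAG files from event data files; DBRelease files are ignored here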
        tagFiles = {}
        EventFiles = {}
        for guid in inputFileDict:
            if '.TAG.' in inputFileDict[guid]:
                tagFiles[guid] = inputFileDict[guid]
            elif not "DBRelease" in inputFileDict[guid]:
                EventFiles[guid] = {}
                EventFiles[guid]['file'] = inputFileDict[guid]

        # 2. create TAG file
        for guid in EventFiles:
            inFiles = [EventFiles[guid]['file']]
            input_tag_file, input_tag_file_guid = self.createTAGFile(
                self.__runCommandList[0], self.__job.trf, inFiles,
                "MakeRunEventCollection.py")
            if input_tag_file != "" and input_tag_file_guid != "":
                tolog("Will run TokenExtractor on file %s" % (input_tag_file))
                EventFiles[guid]['TAG'] = input_tag_file
                EventFiles[guid]['TAG_guid'] = input_tag_file_guid
            else:
                # only for current test
                if len(tagFiles) > 0:
                    EventFiles[guid]['TAG_guid'] = tagFiles.keys()[0]
                    EventFiles[guid]['TAG'] = tagFiles[tagFiles.keys()[0]]
                else:
                    return -1, "Failed to create the TAG file", None

        # 3. create Pool File Catalog
        inputFileDict = dict(zip(self.__job.inFilesGuids, inputFilesGlobal))
        self.__poolFileCatalog = os.path.join(self.__job.workdir,
                                              "PoolFileCatalog_HPC.xml")
        createPoolFileCatalog(inputFileDict, self.__job.inFiles,
                              self.__poolFileCatalog)
        inputFileDictTemp = dict(zip(self.__job.inFilesGuids, inputFiles))
        self.__poolFileCatalogTemp = os.path.join(self.__job.workdir,
                                                  "PoolFileCatalog_Temp.xml")
        self.__poolFileCatalogTempName = "HPCWORKINGDIR/PoolFileCatalog_Temp.xml"
        createPoolFileCatalog(inputFileDictTemp, self.__job.inFiles,
                              self.__poolFileCatalogTemp)

        # 4. getSetupCommand
        setupCommand = self.stripSetupCommand(self.__runCommandList[0],
                                              self.__job.trf)
        _cmd = re.search('(source.+\;)', setupCommand)
        if _cmd:
            setup = _cmd.group(1)
            source_setup = setup.split(";")[0]
            #setupCommand = setupCommand.replace(source_setup, source_setup + " --cmtextratags=ATLAS,useDBRelease")
            # for test, asetup has a bug
            #new_source_setup = source_setup.split("cmtsite/asetup.sh")[0] + "setup-19.2.0-quick.sh"
            #setupCommand = setupCommand.replace(source_setup, new_source_setup)
        tolog("setup command: " + setupCommand)

        # 5. AthenaMP command
        if not self.__copyInputFiles:
            jobInputFileList = ','.join(inputFilesGlobal)
            #for inputFile in self.__job.inFiles:
            #    jobInputFileList = ','.join(os.path.join(self.__job.workdir, inputFile))
            #    self.__runCommandList[0] = self.__runCommandList[0].replace(inputFile, os.path.join(self.__job.workdir, inputFile))
            command_list = self.__runCommandList[0].split(" ")
            command_list_new = []
            for command_part in command_list:
                if command_part.startswith("--input"):
                    command_arg = command_part.split("=")[0]
                    command_part_new = command_arg + "=" + jobInputFileList
                    command_list_new.append(command_part_new)
                else:
                    command_list_new.append(command_part)
            self.__runCommandList[0] = " ".join(command_list_new)

            self.__runCommandList[
                0] += ' --preExec \'from G4AtlasApps.SimFlags import simFlags;simFlags.RunNumber=222222;from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.Strategy="TokenScatterer";from AthenaCommon.AppMgr import ServiceMgr as svcMgr;from AthenaServices.AthenaServicesConf import OutputStreamSequencerSvc;outputStreamSequencerSvc = OutputStreamSequencerSvc();outputStreamSequencerSvc.SequenceIncidentName = "NextEventRange";outputStreamSequencerSvc.IgnoreInputFileBoundary = True;svcMgr += outputStreamSequencerSvc\' '
            self.__runCommandList[
                0] += " '--skipFileValidation' '--checkEventCount=False' '--postExec' 'svcMgr.PoolSvc.ReadCatalog += [\"xmlcatalog_file:%s\"]'" % (
                    self.__poolFileCatalog)
        else:
            self.__runCommandList[
                0] += ' --preExec \'from G4AtlasApps.SimFlags import simFlags;simFlags.RunNumber=222222;from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.Strategy="TokenScatterer";from AthenaCommon.AppMgr import ServiceMgr as svcMgr;from AthenaServices.AthenaServicesConf import OutputStreamSequencerSvc;outputStreamSequencerSvc = OutputStreamSequencerSvc();outputStreamSequencerSvc.SequenceIncidentName = "NextEventRange";outputStreamSequencerSvc.IgnoreInputFileBoundary = True;svcMgr += outputStreamSequencerSvc\' '
            self.__runCommandList[
                0] += " '--skipFileValidation' '--checkEventCount=False' '--postExec' 'svcMgr.PoolSvc.ReadCatalog += [\"xmlcatalog_file:%s\"]'" % (
                    self.__poolFileCatalogTempName)

        # should not have --DBRelease and UserFrontier.py in HPC
        self.__runCommandList[0] = self.__runCommandList[0].replace(
            "--DBRelease=current", "")
        if 'RecJobTransforms/UseFrontier.py,' in self.__runCommandList[0]:
            self.__runCommandList[0] = self.__runCommandList[0].replace(
                'RecJobTransforms/UseFrontier.py,', '')
        if ',RecJobTransforms/UseFrontier.py' in self.__runCommandList[0]:
            self.__runCommandList[0] = self.__runCommandList[0].replace(
                ',RecJobTransforms/UseFrontier.py', '')
        if ' --postInclude=RecJobTransforms/UseFrontier.py ' in self.__runCommandList[
                0]:
            self.__runCommandList[0] = self.__runCommandList[0].replace(
                ' --postInclude=RecJobTransforms/UseFrontier.py ', ' ')

        #self.__runCommandList[0] = self.__runCommandList[0].replace(source_setup, source_setup + " --cmtextratags=ATLAS,useDBRelease --skipFileValidation --checkEventCount=False")
        # for tests, asetup has a bug
        #self.__runCommandList[0] = self.__runCommandList[0].replace(source_setup, new_source_setup)

        self.__runCommandList[
            0] += " 1>athenaMP_stdout.txt 2>athenaMP_stderr.txt"
        self.__runCommandList[0] = self.__runCommandList[0].replace(";;", ";")

        # 6. Token Extractor file list
        # in the token extractor file list, the guid is the Event guid, not the tag guid.
        self.__tagFile = os.path.join(self.__job.workdir,
                                      "TokenExtractor_filelist")
        handle = open(self.__tagFile, 'w')
        for guid in EventFiles:
            tagFile = EventFiles[guid]['TAG']
            line = guid + ",PFN:" + tagFile + "\n"
            handle.write(line)
        handle.close()

        # 7. Token Extractor command
        setup = setupCommand
        self.__tokenExtractorCmd = setup + ";" + " TokenExtractor -v  --source " + self.__tagFile + " 1>tokenExtract_stdout.txt 2>tokenExtract_stderr.txt"
        self.__tokenExtractorCmd = self.__tokenExtractorCmd.replace(";;", ";")
        # special case
        #self.__tokenExtractorCmd = "export LD_LIBRARY_PATH="+source_setup.split("cmtsite/asetup.sh")[0].strip().split(" ")[1]+"/patch/ldpatch/:$LD_LIBRARY_PATH; " + self.__tokenExtractorCmd

        return 0, None, {
            "TokenExtractCmd": self.__tokenExtractorCmd,
            "AthenaMPCmd": self.__runCommandList[0]
        }
Ejemplo n.º 57
0
def updateRunCommandList(runCommandList, pworkdir, jobId, statusPFCTurl, analysisJob, usedFAXandDirectIO, hasInput, prodDBlockToken):
    """ update the run command list if --directIn is no longer needed """
    # the method is using the file state dictionary

    # remove later
    dumpFileStates(pworkdir, jobId, ftype="input")

    # remove any instruction regarding tag file creation for event service jobs
    _runCommandList = []
    for cmd in runCommandList:
        if "--createTAGFileForES" in cmd:
            cmd = cmd.replace("--createTAGFileForES","")
        _runCommandList.append(cmd)
    runCommandList = _runCommandList

    # no need to continue if no input files
    if not hasInput:
        return runCommandList

    # are there only copy_to_scratch transfer modes in the file state dictionary?
    # if so, remove any lingering --directIn instruction
    only_copy_to_scratch = hasOnlyCopyToScratch(pworkdir, jobId)
    if only_copy_to_scratch or 'local' in prodDBlockToken:
#    if hasOnlyCopyToScratch(pworkdir, jobId): # python bug? does not work, have to use previous two lines?
        _runCommandList = []

        if only_copy_to_scratch:
            tolog("There are only copy_to_scratch transfer modes in file state dictionary")
        for cmd in runCommandList:
            # remove the --directIn string if present
            if "--directIn" in cmd:
                tolog("(Removing --directIn instruction from run command since it is not needed)")
                cmd = cmd.replace("--directIn", "")
            # remove the --useFileStager string if present
            if "--useFileStager" in cmd:
                tolog("(Removing --useFileStager instruction from run command since it is not needed)")
                cmd = cmd.replace("--useFileStager", "")
            # remove additional run options if creation of TURL based PFC failed
            if statusPFCTurl == False: # (note: can also be None, so do not use 'if not statusPFCTurl')
                if "--usePFCTurl" in cmd:
                    tolog("(Removing --usePFCTurl instruction from run command since it is not needed)")
                    cmd = cmd.replace(" --usePFCTurl", "")
                if not "--lfcHost" in cmd and analysisJob:
                    tolog("Adding lfcHost to run command")
                    cmd += ' --lfcHost %s' % (readpar('lfchost'))

            tolog("Updated run command: %s" % (cmd))
            _runCommandList.append(cmd)
    else:
        tolog("Nothing to update in run command list related to copy-to-scratch")
        _runCommandList = runCommandList

    # was FAX used as primary site mover in combination with direct I/O?
    if usedFAXandDirectIO == True:
        tolog("Since FAX was used as primary site mover in combination with direct I/O, the run command list need to be updated")
        _runCommandList2 = []

        for cmd in _runCommandList:
            # remove the --lfcHost
            if "--lfcHost" in cmd:
                _lfcHost = ' --lfcHost %s' % (readpar('lfchost'))
                cmd = cmd.replace(_lfcHost, '')
                tolog("(Removed the LFC host:%s)" % (_lfcHost))

            # remove the --oldPrefix
            if "--oldPrefix" in cmd:
                pattern = "(\-\-oldPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --oldPrefix pattern)")

            # remove the --newPrefix
            if "--newPrefix" in cmd:
                pattern = "(\-\-newPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --newPrefix pattern)")

            # add the --usePFCTurl if not there already
            if not "--usePFCTurl" in cmd and analysisJob:
                cmd += " --usePFCTurl"
                tolog("(Added --usePFCTurl)")

            tolog("Updated run command: %s" % (cmd))
            _runCommandList2.append(cmd)

        _runCommandList = _runCommandList2


    ### new movers quick integration: reuse usedFAXandDirectIO variable with special meaning
    ### to avoid any LFC and prefixes lookups in transformation scripts
    ### since new movers already form proper pfn values
    ### proper workflow is required: to be reimplemented later
    if usedFAXandDirectIO == 'newmover' or usedFAXandDirectIO == 'newmover-directaccess':
        tolog("updateRunCommandList(): use new movers logic")
        tolog("updateRunCommandList(): remove to be deprecated options (--lfcHost, --oldPrefix, --newPrefix) from command list")
        tolog("updateRunCommandList(): force to set --usePFCTurl")
        tolog("updateRunCommandList(): check directaccess mode if need (--directIn)")
        tolog("current runCommandList=%s" % _runCommandList)

        _runCommandList2 = []

        for cmd in _runCommandList:

            # remove the --lfcHost, --oldPrefix, --newPrefix
            # add --usePFCTurl

            if "--lfcHost" in cmd:
                cmd = removePattern(cmd, "(\-\-lfcHost\ \S+)")
                tolog("(Removed the --lfcHost)")

            if "--oldPrefix" in cmd:
                pattern = "(\-\-oldPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --oldPrefix pattern)")

            if "--newPrefix" in cmd:
                pattern = "(\-\-newPrefix\ \S+)"
                cmd = removePattern(cmd, pattern)
                tolog("(Removed --newPrefix pattern)")

            if "--usePFCTurl" not in cmd and analysisJob:
                cmd += " --usePFCTurl"
                tolog("(Added --usePFCTurl)")

            # add --directIn if needed
            if usedFAXandDirectIO == 'newmover-directaccess':
                if "--directIn" not in cmd and analysisJob:
                    cmd += " --directIn"
                    tolog("(Added --directIn)")

            tolog("Updated run command: %s" % cmd)

            _runCommandList2.append(cmd)

        _runCommandList = _runCommandList2

    tolog("Dumping final input file states")
    dumpFileStates(pworkdir, jobId, ftype="input")

    return _runCommandList
Ejemplo n.º 58
0
    def executePayload(self, thisExperiment, job):

        t0 = os.times()
        res_tuple = None

        # loop over all run commands (only >1 for multi-trfs)
        getstatusoutput_was_interrupted = False
        job_status = None
        tolog("About to launch ARGO job")
        # Poll MQ for Job Status
        try:
            # Initiate MQ interface and send job
            self.argo_job.job_status_routing_key = '%s_job_status' % job.jobId  #'status_' + jobID
            si = SiteInformation()
            mi = MessageInterface()
            mi.host = 'atlasgridftp02.hep.anl.gov'
            mi.port = 5671
            mi.ssl_cert = si.getSSLCertificate()  #'/grid/atlas/hpc/pilot_certs/xrootdsrv-cert.pem'
            proxy_cert_path = si.getSSLCertificate()
            mi.ssl_cert = os.path.dirname(proxy_cert_path) + "/rabbitmq-cert.pem"
            if 'X509_USER_CERT' in os.environ.keys():
                mi.ssl_cert = os.environ[
                    'X509_USER_CERT']  #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-cert.pem'

            mi.ssl_key = mi.ssl_cert  #'/grid/atlas/hpc/pilot_certs/xrootdsrv-key.pem'
            mi.ssl_key = os.path.dirname(proxy_cert_path) + "/rabbitmq-key.pem"
            if 'X509_USER_KEY' in os.environ.keys():
                mi.ssl_key = os.environ[
                    'X509_USER_KEY']  #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/xrootdsrv-key.pem'

            #mi.ssl_ca_certs = os.path.dirname(proxy_cert_path) + "/rabbitmq-cacerts.pem"
            mi.ssl_ca_certs = '/grid/atlas/hpc/pilot_certs/cacerts.pem'
            #if 'X509_CA_CERTS' in os.environ.keys():
            #    mi.ssl_ca_certs = os.environ['X509_CA_CERTS'] #'/users/hpcusers/balsam_dev/gridsecurity/jchilders/cacerts.pem'
            #tolog("CA certs: %s" % (mi.ssl_ca_certs))
            ca_certs = os.path.dirname(
                proxy_cert_path) + "/rabbitmq-cacerts.pem"
            if os.path.isfile(ca_certs):
                mi.ssl_ca_certs = ca_certs

            mi.exchange_name = 'argo_users'

            #Create queue to get messages about ARGO Job status from MQ
            tolog('Opening connection with MQ')
            mi.open_blocking_connection()
            tolog('Creating queue [%s] to retrieve job status messages' %
                  self.argo_job.job_status_routing_key)

            mi.create_queue(self.argo_job.job_status_routing_key,
                            self.argo_job.job_status_routing_key)

            # submit ARGO job to MQ

            #tolog('Opening connection with MQ')
            #mi.open_blocking_connection()
            routing_key = 'argo_job'
            if self.dev:
                routing_key = 'argo_job_dev'
            tolog('Sending msg with job to ARGO')
            mi.send_msg(self.argo_job.serialize(), routing_key)
            tolog(' done sending ')

            # Waiting till job done or failed
            ARGO_err_msg = ''
            while True:
                time.sleep(5)
                message = mi.receive_msg(self.argo_job.job_status_routing_key,
                                         True)
                if message[2]:
                    tolog(
                        "Got message from queue [%s]: method [%s], properties [%s], body [ %s ]"
                        % (self.argo_job.job_status_routing_key, message[0],
                           message[1], message[2]))
                    job_status = ArgoJobStatus.get_from_message(message[2])
                    job.hpcStatus = job_status.state
                    rt = RunJobUtilities.updatePilotServer(
                        job, self.getPilotServer(), self.getPilotPort())

                    tolog("Extracted state: %s" % job_status.state)
                    if job_status.state == job_status.HISTORY:
                        res_tuple = (0, "Done")
                        break
                    elif job_status.is_failed():
                        res_tuple = (1, "Failed")
                        ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                    elif job_status.state == job_status.FAILED:
                        res_tuple = (1, "Failed")
                        ARGO_err_msg = ARGO_err_msg + ' ' + job_status.message
                        runJob.failJob(1,
                                       0,
                                       job,
                                       ins=job.inFiles,
                                       pilotErrorDiag=ARGO_err_msg)
                        break
                time.sleep(5)

            mi.close()
            tolog('Closed connection to MQ')

            tolog("Job State: %s" % (job_status.state))
            #job.timeExe = int(fork_job.finished - fork_job.started)

            ####################################################

        except Exception, e:
            tolog("!!FAILED!!3000!! Failed to run command %s" % str(e))
            getstatusoutput_was_interrupted = True
            res_tuple = (1, "Failed")
            self.failJob(0,
                         self.__error.ERR_GENERALERROR,
                         job,
                         pilotErrorDiag=str(e))
Ejemplo n.º 59
0
def removeSkippedFromJobPars(fname, jobPars):
    """ remove skipped input files from jobPars """

    # get the skipped file names from the xml
    skipped = getLFNsFromSkippedXML(fname)

    if skipped == []:
        tolog("Did not find any skipped LFNs in: %s" % (fname))
    else:
        tolog("Removing skipped input files from jobPars")
        tolog("..skipped: %s" % str(skipped))
        tolog("..jobPars:\n%s" % (jobPars))
        for skip in skipped:
            tolog("..Removing: %s" % (skip))
            # try the different quoting/comma styles that may appear in jobPars
            _skip = "\'%s\'," % (skip)
            if _skip in jobPars:
                jobPars = jobPars.replace(_skip,'')
                tolog('..Removed %s from jobPars' % (_skip))
            else:
                _skip = "\'%s\'" % (skip)
                if _skip in jobPars:
                    jobPars = jobPars.replace(_skip,'')
                    tolog('..Removed %s from jobPars' % (_skip))
                else:
                    _skip = "%s," % (skip)
                    if _skip in jobPars:
                        jobPars = jobPars.replace(skip,'')
                        tolog('..Removed %s from jobPars' % (skip))
                    else:
                        if skip in jobPars:
                            jobPars = jobPars.replace(skip,'')
                            print '..Removed %s from jobPars' % (skip)
                        else:
                            # nothing to remove
                            tolog("..Found nothing to remove from jobPars: %s" % (jobPars))
    return jobPars
Ejemplo n.º 60
0
            tolog("Job State: %s" % (job_status.state))
            #job.timeExe = int(fork_job.finished - fork_job.started)

            ####################################################

        except Exception, e:
            tolog("!!FAILED!!3000!! Failed to run command %s" % str(e))
            getstatusoutput_was_interrupted = True
            res_tuple = (1, "Failed")
            self.failJob(0,
                         self.__error.ERR_GENERALERROR,
                         job,
                         pilotErrorDiag=str(e))
        else:
            if res_tuple[0] == 0:
                tolog("ARGO Job finished")
            else:
                tolog("ARGO Job failed: res = %s" % (str(res_tuple)))

        t1 = os.times()
        # CPU consumption metrics
        # t = map(lambda x, y:x-y, t1, t0) # get the time consumed
        # job.cpuConsumptionUnit, job.cpuConsumptionTime, job.cpuConversionFactor = pUtil.setTimeConsumed(t)
        # tolog("Job CPU usage: %s %s" % (job.cpuConsumptionTime, job.cpuConsumptionUnit))
        # tolog("Job CPU conversion factor: %1.10f" % (job.cpuConversionFactor))
        job.timeExe = int(round(t1[4] - t0[4]))

        tolog("Original exit code: %s" % (res_tuple[0]))
        if res_tuple[0] != None:
            tolog("Exit code: %s (returned from OS)" % (res_tuple[0] % 255))
            res0 = res_tuple[0] % 255